diff --git a/.ci/docker/ci_commit_pins/pytorch.txt b/.ci/docker/ci_commit_pins/pytorch.txt index 3251f4ee9a7..0e9181ac55a 100644 --- a/.ci/docker/ci_commit_pins/pytorch.txt +++ b/.ci/docker/ci_commit_pins/pytorch.txt @@ -1 +1 @@ -c42ac54d9e817bf0a0366eb78e6c8beba4d5eff5 +aec9b2ab77389967ef39bb9c10662fd0fe3e185a diff --git a/.ci/docker/ci_commit_pins/torchao.txt b/.ci/docker/ci_commit_pins/torchao.txt new file mode 100644 index 00000000000..768110b82ff --- /dev/null +++ b/.ci/docker/ci_commit_pins/torchao.txt @@ -0,0 +1 @@ +0916b5b29b092afcbf2b898caae49abe80662bac diff --git a/.ci/docker/common/install_linter.sh b/.ci/docker/common/install_linter.sh index 4a796a72d54..d262176e49b 100755 --- a/.ci/docker/common/install_linter.sh +++ b/.ci/docker/common/install_linter.sh @@ -13,3 +13,7 @@ source "$(dirname "${BASH_SOURCE[0]}")/utils.sh" # NB: Install all linter dependencies, the caching of lintrunner init could be # done after Executorch becomes public pip_install -r requirements-lintrunner.txt + +# Install google-java-format +curl -L --retry 3 https://github.com/google/google-java-format/releases/download/v1.23.0/google-java-format_linux-x86-64 > /opt/google-java-format +chmod +x /opt/google-java-format diff --git a/.ci/scripts/build-qnn-sdk.sh b/.ci/scripts/build-qnn-sdk.sh index 260072f7342..deeaed34ac3 100644 --- a/.ci/scripts/build-qnn-sdk.sh +++ b/.ci/scripts/build-qnn-sdk.sh @@ -6,11 +6,12 @@ # LICENSE file in the root directory of this source tree. set -eux +set -o xtrace build_qnn_backend() { echo "Start building qnn backend." export ANDROID_NDK_ROOT=/opt/ndk - export QNN_SDK_ROOT=/tmp/qnn/2.23.0.240531 + export QNN_SDK_ROOT=/tmp/qnn/2.25.0.240728 export EXECUTORCH_ROOT="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")/../.." && pwd)" bash backends/qualcomm/scripts/build.sh --skip_aarch64 --job_number 2 --release @@ -26,8 +27,9 @@ set_up_aot() { -DCMAKE_INSTALL_PREFIX=$PWD \ -DEXECUTORCH_BUILD_QNN=ON \ -DQNN_SDK_ROOT=${QNN_SDK_ROOT} \ - -DEXECUTORCH_BUILD_SDK=ON \ + -DEXECUTORCH_BUILD_DEVTOOLS=ON \ -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ -DEXECUTORCH_ENABLE_EVENT_TRACER=ON \ -DPYTHON_EXECUTABLE=python3 \ -DEXECUTORCH_SEPARATE_FLATCC_HOST_PROJECT=OFF diff --git a/.ci/scripts/build_llama_android.sh b/.ci/scripts/build_llama_android.sh index 644fc4c2bb7..7d3370ee561 100644 --- a/.ci/scripts/build_llama_android.sh +++ b/.ci/scripts/build_llama_android.sh @@ -22,8 +22,9 @@ install_executorch_and_backend_lib() { -DANDROID_PLATFORM=android-23 \ -DCMAKE_INSTALL_PREFIX=cmake-android-out \ -DCMAKE_BUILD_TYPE=Release \ - -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ + -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ -DEXECUTORCH_BUILD_XNNPACK=ON \ -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ diff --git a/.ci/scripts/setup-ios.sh b/.ci/scripts/setup-ios.sh new file mode 100755 index 00000000000..519cd2581eb --- /dev/null +++ b/.ci/scripts/setup-ios.sh @@ -0,0 +1,33 @@ +#!/bin/bash +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
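The QNN SDK root bumped to /tmp/qnn/2.25.0.240728 above is hard-coded in several scripts touched by this patch (build-qnn-sdk.sh, setup-qnn-deps.sh, test_llama.sh). A minimal sketch of a shared guard that fails fast when the expected layout is missing; the helper name is hypothetical and the function is not part of this patch:

# Sketch only, assuming the /tmp/qnn layout that setup-qnn-deps.sh unpacks.
assert_qnn_sdk_present() {
  local expected="/tmp/qnn/2.25.0.240728"
  if [[ ! -d "${expected}" ]]; then
    echo "QNN SDK not found at ${expected}; run .ci/scripts/setup-qnn-deps.sh first" >&2
    return 1
  fi
  export QNN_SDK_ROOT="${expected}"
}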
+ +set -exu + +# This script follows the instructions from GitHub to install an Apple certificate +# https://docs.github.com/en/actions/use-cases-and-examples/deploying/installing-an-apple-certificate-on-macos-runners-for-xcode-development + +CERTIFICATE_PATH="${RUNNER_TEMP}"/build_certificate.p12 +PP_PATH="${RUNNER_TEMP}"/build_pp.mobileprovision +KEYCHAIN_PATH="${RUNNER_TEMP}"/app-signing.keychain-db + +# Import certificate and provisioning profile from secrets +echo -n "$BUILD_CERTIFICATE_BASE64" | base64 --decode -o $CERTIFICATE_PATH +echo -n "$BUILD_PROVISION_PROFILE_BASE64" | base64 --decode -o $PP_PATH + +# Create a temporary keychain +security create-keychain -p "$KEYCHAIN_PASSWORD" $KEYCHAIN_PATH +security set-keychain-settings -lut 21600 $KEYCHAIN_PATH +security unlock-keychain -p "$KEYCHAIN_PASSWORD" $KEYCHAIN_PATH + +# Import certificate to the keychain +security import $CERTIFICATE_PATH -P "" -A -t cert -f pkcs12 -k $KEYCHAIN_PATH +security set-key-partition-list -S apple-tool:,apple: -k "$KEYCHAIN_PASSWORD" $KEYCHAIN_PATH +security list-keychain -d user -s $KEYCHAIN_PATH + +# Apply provisioning profile +mkdir -p ~/Library/MobileDevice/Provisioning\ Profiles +cp $PP_PATH ~/Library/MobileDevice/Provisioning\ Profiles diff --git a/.ci/scripts/setup-linux.sh b/.ci/scripts/setup-linux.sh index 4bccabad5cf..5df4668f65c 100755 --- a/.ci/scripts/setup-linux.sh +++ b/.ci/scripts/setup-linux.sh @@ -20,6 +20,5 @@ fi # As Linux job is running inside a Docker container, all of its dependencies # have already been installed -install_flatc_from_source install_executorch build_executorch_runner "${BUILD_TOOL}" diff --git a/.ci/scripts/setup-macos.sh b/.ci/scripts/setup-macos.sh index 2be7d9efe83..833ba0aafe6 100755 --- a/.ci/scripts/setup-macos.sh +++ b/.ci/scripts/setup-macos.sh @@ -128,7 +128,5 @@ if [[ -z "${GITHUB_RUNNER:-}" ]]; then fi print_cmake_info -install_pytorch_and_domains -install_flatc_from_source install_executorch build_executorch_runner "${BUILD_TOOL}" diff --git a/.ci/scripts/setup-qnn-deps.sh b/.ci/scripts/setup-qnn-deps.sh index 3b39e1aafe3..92ffd07bccc 100644 --- a/.ci/scripts/setup-qnn-deps.sh +++ b/.ci/scripts/setup-qnn-deps.sh @@ -7,14 +7,18 @@ set -ex +verify_pkg_installed() { + echo $(dpkg-query -W --showformat='${Status}\n' $1|grep "install ok installed") +} + install_qnn() { echo "Start installing qnn." QNN_INSTALLATION_DIR=/tmp/qnn mkdir -p "${QNN_INSTALLATION_DIR}" - curl -Lo /tmp/v2.23.0.24.06.24.zip "https://softwarecenter.qualcomm.com/api/download/software/qualcomm_neural_processing_sdk/v2.23.0.24.06.24.zip" + curl -Lo /tmp/v2.25.0.24.07.28.zip "https://softwarecenter.qualcomm.com/api/download/software/qualcomm_neural_processing_sdk/v2.25.0.240728.zip" echo "Finishing downloading qnn sdk." - unzip -qo /tmp/v2.23.0.24.06.24.zip -d /tmp + unzip -qo /tmp/v2.25.0.24.07.28.zip -d /tmp echo "Finishing unzip qnn sdk." @@ -26,4 +30,22 @@ install_qnn() { ls -lah "${QNN_INSTALLATION_DIR}" } +setup_libc++() { + sudo apt-get update + pkgs_to_check=('libc++-dev') + j=0 + while [ $j -lt ${#pkgs_to_check[*]} ]; do + install_status=$(verify_pkg_installed ${pkgs_to_check[$j]}) + if [ "$install_status" == "" ]; then + sudo apt-get install -y ${pkgs_to_check[$j]} + if [[ $? 
-ne 0 ]]; then + echo "ERROR: Failed to install required packages for libc++" + exit 1 + fi + fi + j=$(( $j +1)); + done +} + +setup_libc++ install_qnn diff --git a/.ci/scripts/test_llama.sh b/.ci/scripts/test_llama.sh index 4fa8c94905f..2e51866d902 100644 --- a/.ci/scripts/test_llama.sh +++ b/.ci/scripts/test_llama.sh @@ -11,7 +11,7 @@ source "$(dirname "${BASH_SOURCE[0]}")/utils.sh" MODEL_NAME=$1 # stories110M BUILD_TOOL=$2 # buck2 or cmake -DTYPE=$3 # fp16 or fp32 +DTYPE=$3 # fp16, bf16, or fp32 MODE=${4:-"xnnpack+custom"} # portable or xnnpack+custom or xnnpack+custom+qe UPLOAD_DIR=${5:-} if [[ $# -lt 4 ]]; then # Assuming 4 mandatory args @@ -29,7 +29,7 @@ if [[ -z "${BUILD_TOOL:-}" ]]; then fi if [[ -z "${DTYPE:-}" ]]; then - echo "Missing dtype, choose fp16 or fp32, exiting..." + echo "Missing dtype, choose fp16, bf16, or fp32, exiting..." exit 1 fi @@ -75,7 +75,7 @@ echo "COREML option ${COREML}" if [[ "${MODE}" =~ .*qnn.* ]]; then QNN=ON export EXECUTORCH_ROOT="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")/.." && pwd)" - export QNN_SDK_ROOT=/tmp/qnn/2.23.0.240531 + export QNN_SDK_ROOT=/tmp/qnn/2.25.0.240728 export LD_LIBRARY_PATH="${QNN_SDK_ROOT}/lib/x86_64-linux-clang" export PYTHONPATH=".." cp schema/program.fbs exir/_serialize/program.fbs @@ -107,8 +107,9 @@ cmake_install_executorch_libraries() { retry cmake \ -DCMAKE_INSTALL_PREFIX=cmake-out \ -DCMAKE_BUILD_TYPE=Debug \ - -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ + -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ -DEXECUTORCH_BUILD_KERNELS_CUSTOM="$CUSTOM" \ -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ @@ -173,6 +174,8 @@ fi EXPORTED_MODEL_NAME="llama2" if [[ "${DTYPE}" == "fp16" ]]; then EXPORTED_MODEL_NAME="${EXPORTED_MODEL_NAME}_h" +elif [[ "${DTYPE}" == "bf16" ]]; then + EXPORTED_MODEL_NAME="${EXPORTED_MODEL_NAME}_bf" elif [[ "${DTYPE}" == "fp32" ]]; then : else diff --git a/.ci/scripts/test_llava.sh b/.ci/scripts/test_llava.sh index 60589c96d47..8ac87b2302d 100644 --- a/.ci/scripts/test_llava.sh +++ b/.ci/scripts/test_llava.sh @@ -8,44 +8,99 @@ set -exu # shellcheck source=/dev/null +BUILD_TYPE=${1:-Debug} +TARGET_OS=${2:-Native} +BUILD_DIR=${3:-cmake-out} + +echo "Building with BUILD_TYPE: $BUILD_TYPE, TARGET_OS: $TARGET_OS, BUILD_DIR: $BUILD_DIR" + if [[ -z "${PYTHON_EXECUTABLE:-}" ]]; then - PYTHON_EXECUTABLE=python3 + PYTHON_EXECUTABLE=python3 fi +TARGET_OS_lower="$(echo "${TARGET_OS}" | awk '{print tolower($0)}')" +if [[ "${TARGET_OS_lower}" == "android" ]]; then + if [[ -z "${ANDROID_NDK}" ]]; then + echo "Set ANDROID_NDK environment variable to build for Android." 
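For reference, the dpkg-based check that setup-qnn-deps.sh adds above (verify_pkg_installed plus the setup_libc++ loop) can be condensed into one loop over package names. This is a sketch under the same apt/sudo assumptions, not part of the patch:

# Sketch only: install each package that dpkg does not report as installed.
ensure_apt_packages() {
  sudo apt-get update
  local pkg
  for pkg in "$@"; do
    if ! dpkg-query -W --showformat='${Status}\n' "${pkg}" 2>/dev/null | grep -q "install ok installed"; then
      sudo apt-get install -y "${pkg}" || { echo "ERROR: failed to install ${pkg}" >&2; return 1; }
    fi
  done
}
# Usage: ensure_apt_packages libc++-dev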
+ exit 1 + fi +fi + +# Number of processes for a parallel build +NPROC=8 +if hash nproc &> /dev/null; then NPROC=$(nproc); fi + +EXECUTORCH_COMMON_CMAKE_ARGS=" \ + -DCMAKE_INSTALL_PREFIX=${BUILD_DIR} \ + -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \ + -DEXECUTORCH_ENABLE_LOGGING=ON \ + -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ + -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ + -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \ + -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ + -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ + -DEXECUTORCH_BUILD_XNNPACK=ON \ + -DEXECUTORCH_DO_NOT_USE_CXX11_ABI=ON \ + -DEXECUTORCH_XNNPACK_SHARED_WORKSPACE=ON" + cmake_install_executorch_libraries() { - cmake \ - -DCMAKE_INSTALL_PREFIX=cmake-out \ - -DCMAKE_BUILD_TYPE=Debug \ - -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ - -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ - -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \ - -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ - -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ - -DEXECUTORCH_BUILD_XNNPACK=ON \ - -DEXECUTORCH_DO_NOT_USE_CXX11_ABI=ON \ - -DEXECUTORCH_XNNPACK_SHARED_WORKSPACE=ON \ - -Bcmake-out . - - - cmake --build cmake-out -j9 --target install --config Debug + cmake \ + ${EXECUTORCH_COMMON_CMAKE_ARGS} \ + -B${BUILD_DIR} . + + cmake --build ${BUILD_DIR} -j${NPROC} --target install --config ${BUILD_TYPE} } +cmake_install_executorch_libraries_for_android() { + cmake \ + -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \ + -DANDROID_ABI=arm64-v8a \ + -DANDROID_PLATFORM=android-23 \ + ${EXECUTORCH_COMMON_CMAKE_ARGS} \ + -B${BUILD_DIR} . + + cmake --build ${BUILD_DIR} -j${NPROC} --target install --config ${BUILD_TYPE} +} + + +LLAVA_COMMON_CMAKE_ARGS=" \ + -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" \ + -DCMAKE_INSTALL_PREFIX=${BUILD_DIR} \ + -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \ + -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \ + -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ + -DEXECUTORCH_BUILD_XNNPACK=ON" + cmake_build_llava_runner() { dir=examples/models/llava python_lib=$($PYTHON_EXECUTABLE -c 'from distutils.sysconfig import get_python_lib; print(get_python_lib())') - cmake \ - -DCMAKE_INSTALL_PREFIX=cmake-out \ - -DCMAKE_BUILD_TYPE=Debug \ - -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \ - -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ - -DEXECUTORCH_BUILD_XNNPACK=ON \ - -DCMAKE_PREFIX_PATH="$python_lib" \ - -Bcmake-out/${dir} \ + cmake \ + ${LLAVA_COMMON_CMAKE_ARGS} \ + -DCMAKE_PREFIX_PATH="$python_lib" \ + -B${BUILD_DIR}/${dir} \ ${dir} + cmake --build ${BUILD_DIR}/${dir} -j${NPROC} --config ${BUILD_TYPE} +} + - cmake --build cmake-out/${dir} -j9 --config Debug +cmake_build_llava_runner_for_android() { + dir=examples/models/llava + python_lib=$($PYTHON_EXECUTABLE -c 'from distutils.sysconfig import get_python_lib; print(get_python_lib())') + + cmake \ + -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \ + -DANDROID_ABI=arm64-v8a \ + -DANDROID_PLATFORM=android-23 \ + ${LLAVA_COMMON_CMAKE_ARGS} \ + -DCMAKE_PREFIX_PATH="$python_lib" \ + -DLLAVA_RUNNER_NO_TORCH_DUMMY_IMAGE=ON \ + -B${BUILD_DIR}/${dir} \ + ${dir} + + cmake --build ${BUILD_DIR}/${dir} -j${NPROC} --config ${BUILD_TYPE} } # only export the one without custom op for now since it's @@ -54,6 +109,13 @@ export_llava() { $PYTHON_EXECUTABLE -m executorch.examples.models.llava.export_llava --pte-name llava.pte --with-artifacts } +# Download a new image with different size, to test if the model can handle different image sizes +prepare_image_tensor() { + echo "Downloading image" + curl -o basketball.jpg 
https://upload.wikimedia.org/wikipedia/commons/7/73/Chicago_Bulls_and_New_Jersey_Nets%2C_March_28%2C_1991.jpg + $PYTHON_EXECUTABLE -m executorch.examples.models.llava.image_util --image-path basketball.jpg --output-path image.pt +} + run_and_verify() { NOW=$(date +"%H:%M:%S") echo "Starting to run llava runner at ${NOW}" @@ -69,17 +131,33 @@ run_and_verify() { echo "tokenizer.bin is missing." exit 1 fi - RUNTIME_ARGS="--model_path=llava.pte \ - --tokenizer_path=tokenizer.bin \ - --image_path=image.pt \ - --prompt=ASSISTANT: \ - --temperature=0 \ - --seq_len=650" - cmake-out/examples/models/llava/llava_main ${RUNTIME_ARGS} > result.txt + + + + RUNTIME_ARGS="--model_path=llava.pte \ + --tokenizer_path=tokenizer.bin \ + --image_path=image.pt \ + --prompt=ASSISTANT: \ + --temperature=0 \ + --seq_len=650" + + if [[ "${TARGET_OS_lower}" == "android" ]]; then + echo "Transfer relevant files to the phone via ADB and run llava_main with following args," + echo "$ llava_main ${RUNTIME_ARGS} " + exit 0; + fi + + ${BUILD_DIR}/examples/models/llava/llava_main ${RUNTIME_ARGS} > result.txt + # verify result.txt RESULT=$(cat result.txt) # set the expected prefix to be the same as prompt because there's a bug in sdpa_with_kv_cache that causes tokens. - EXPECTED_PREFIX="ASSISTANT:" + if [[ "$(uname)" == "Darwin" ]]; then + EXPECTED_PREFIX="ASSISTANT: image captures a basketball game in progress, with several players on the court. One of the players is dribbling the ball, while the others are in various" + else + # set the expected prefix to be the same as prompt because there's a bug in sdpa_with_kv_cache that causes tokens. + EXPECTED_PREFIX="ASSISTANT:" + fi if [[ "${RESULT}" == *"${EXPECTED_PREFIX}"* ]]; then echo "Expected result prefix: ${EXPECTED_PREFIX}" echo "Actual result: ${RESULT}" @@ -93,7 +171,20 @@ run_and_verify() { fi } -cmake_install_executorch_libraries -cmake_build_llava_runner +# Step1. Build stuff +if [[ "${TARGET_OS_lower}" == "android" ]]; then + cmake_install_executorch_libraries_for_android + cmake_build_llava_runner_for_android +elif [[ "${TARGET_OS_lower}" == "native" ]]; then + cmake_install_executorch_libraries + cmake_build_llava_runner +else + echo "Invalid TARGET_OS ($2): ${TARGET_OS}" +fi + +# Step2. Generate the PTE export_llava + +# Step3. Run +prepare_image_tensor run_and_verify diff --git a/.ci/scripts/test.sh b/.ci/scripts/test_model.sh similarity index 75% rename from .ci/scripts/test.sh rename to .ci/scripts/test_model.sh index 1f20042f02a..f558a508c93 100755 --- a/.ci/scripts/test.sh +++ b/.ci/scripts/test_model.sh @@ -50,13 +50,13 @@ prepare_artifacts_upload() { build_cmake_executor_runner() { echo "Building executor_runner" - (rm -rf ${CMAKE_OUTPUT_DIR} \ - && mkdir ${CMAKE_OUTPUT_DIR} \ - && cd ${CMAKE_OUTPUT_DIR} \ - && retry cmake -DCMAKE_BUILD_TYPE=Release \ - -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" ..) + rm -rf ${CMAKE_OUTPUT_DIR} + cmake -DCMAKE_BUILD_TYPE=Debug \ + -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ + -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" \ + -B${CMAKE_OUTPUT_DIR} . - cmake --build ${CMAKE_OUTPUT_DIR} -j4 + cmake --build ${CMAKE_OUTPUT_DIR} -j4 --config Debug } run_portable_executor_runner() { @@ -64,9 +64,7 @@ run_portable_executor_runner() { if [[ "${BUILD_TOOL}" == "buck2" ]]; then buck2 run //examples/portable/executor_runner:executor_runner -- --model_path "./${MODEL_NAME}.pte" elif [[ "${BUILD_TOOL}" == "cmake" ]]; then - if [[ ! 
-f ${CMAKE_OUTPUT_DIR}/executor_runner ]]; then - build_cmake_executor_runner - fi + build_cmake_executor_runner ./${CMAKE_OUTPUT_DIR}/executor_runner --model_path "./${MODEL_NAME}.pte" else echo "Invalid build tool ${BUILD_TOOL}. Only buck2 and cmake are supported atm" @@ -156,9 +154,41 @@ test_model_with_qnn() { export PYTHONPATH=$EXECUTORCH_ROOT/.. if [[ "${MODEL_NAME}" == "dl3" ]]; then - "${PYTHON_EXECUTABLE}" -m examples.qualcomm.scripts.deeplab_v3 -b ${CMAKE_OUTPUT_DIR} -m SM8550 --compile_only --download - EXPORTED_MODEL=./deeplab_v3/dlv3_qnn.pte + EXPORT_SCRIPT=deeplab_v3 + EXPORTED_MODEL_NAME=dlv3_qnn.pte + elif [[ "${MODEL_NAME}" == "mv3" ]]; then + EXPORT_SCRIPT=mobilenet_v3 + EXPORTED_MODEL_NAME=mv3_qnn.pte + elif [[ "${MODEL_NAME}" == "mv2" ]]; then + EXPORT_SCRIPT=mobilenet_v2 + EXPORTED_MODEL_NAME=mv2_qnn.pte + elif [[ "${MODEL_NAME}" == "ic4" ]]; then + EXPORT_SCRIPT=inception_v4 + EXPORTED_MODEL_NAME=ic4_qnn.pte + elif [[ "${MODEL_NAME}" == "ic3" ]]; then + EXPORT_SCRIPT=inception_v3 + EXPORTED_MODEL_NAME=ic3_qnn.pte + elif [[ "${MODEL_NAME}" == "vit" ]]; then + EXPORT_SCRIPT=torchvision_vit + EXPORTED_MODEL_NAME=vit_qnn.pte fi + + # Use SM8450 for S22, SM8550 for S23, and SM8560 for S24 + # TODO(guangyang): Make QNN chipset matches the target device + QNN_CHIPSET=SM8450 + + "${PYTHON_EXECUTABLE}" -m examples.qualcomm.scripts.${EXPORT_SCRIPT} -b ${CMAKE_OUTPUT_DIR} -m ${QNN_CHIPSET} --compile_only + EXPORTED_MODEL=./${EXPORT_SCRIPT}/${EXPORTED_MODEL_NAME} +} + +test_model_with_coreml() { + if [[ "${BUILD_TOOL}" == "buck2" ]]; then + echo "coreml doesn't support buck2." + exit 1 + fi + + "${PYTHON_EXECUTABLE}" -m examples.apple.coreml.scripts.export --model_name="${MODEL_NAME}" + EXPORTED_MODEL=$(find "." -type f -name "${MODEL_NAME}*.pte" -print -quit) } if [[ "${BACKEND}" == "portable" ]]; then @@ -170,9 +200,21 @@ elif [[ "${BACKEND}" == "qnn" ]]; then if [[ $? -eq 0 ]]; then prepare_artifacts_upload fi +elif [[ "${BACKEND}" == "coreml" ]]; then + echo "Testing ${MODEL_NAME} with coreml..." + test_model_with_coreml + if [[ $? -eq 0 ]]; then + prepare_artifacts_upload + fi elif [[ "${BACKEND}" == "xnnpack" ]]; then echo "Testing ${MODEL_NAME} with xnnpack..." - test_model_with_xnnpack true true + WITH_QUANTIZATION=true + WITH_DELEGATION=true + if [[ "$MODEL_NAME" == "mobilebert" ]]; then + # TODO(T197452682) + WITH_QUANTIZATION=false + fi + test_model_with_xnnpack "${WITH_QUANTIZATION}" "${WITH_DELEGATION}" if [[ $? -eq 0 ]]; then prepare_artifacts_upload fi diff --git a/.ci/scripts/test_phi_3_mini.sh b/.ci/scripts/test_phi_3_mini.sh new file mode 100644 index 00000000000..40767013e23 --- /dev/null +++ b/.ci/scripts/test_phi_3_mini.sh @@ -0,0 +1,116 @@ +#!/bin/bash +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
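The model-name mapping added to test_model_with_qnn in .ci/scripts/test_model.sh above grows by one elif per model. As a readability note only (the patch keeps the if/elif form), the same mapping as a case statement, with an explicit error branch added here for illustration:

case "${MODEL_NAME}" in
  dl3) EXPORT_SCRIPT=deeplab_v3;      EXPORTED_MODEL_NAME=dlv3_qnn.pte ;;
  mv3) EXPORT_SCRIPT=mobilenet_v3;    EXPORTED_MODEL_NAME=mv3_qnn.pte ;;
  mv2) EXPORT_SCRIPT=mobilenet_v2;    EXPORTED_MODEL_NAME=mv2_qnn.pte ;;
  ic4) EXPORT_SCRIPT=inception_v4;    EXPORTED_MODEL_NAME=ic4_qnn.pte ;;
  ic3) EXPORT_SCRIPT=inception_v3;    EXPORTED_MODEL_NAME=ic3_qnn.pte ;;
  vit) EXPORT_SCRIPT=torchvision_vit; EXPORTED_MODEL_NAME=vit_qnn.pte ;;
  *)   echo "Unsupported QNN model ${MODEL_NAME}" >&2; exit 1 ;;
esac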
+ +set -exu + +BUILD_TYPE=${1:-Debug} +BUILD_DIR=${3:-cmake-out} +MODEL_DIR=examples/models/phi-3-mini + +echo "Building with BUILD_TYPE: $BUILD_TYPE, BUILD_DIR: $BUILD_DIR" + +if [[ -z "${PYTHON_EXECUTABLE:-}" ]]; then + PYTHON_EXECUTABLE=python3 +fi + +# Number of processes for a parallel build +NPROC=8 +if hash nproc &> /dev/null; then NPROC=$(nproc); fi + +cmake_install_executorch_libraries() { + cmake -DPYTHON_EXECUTABLE=python \ + -DCMAKE_INSTALL_PREFIX=${BUILD_DIR} \ + -DEXECUTORCH_ENABLE_LOGGING=1 \ + -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \ + -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ + -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ + -DEXECUTORCH_BUILD_XNNPACK=ON \ + -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ + -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ + -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \ + -B${BUILD_DIR} . + + cmake --build ${BUILD_DIR} -j${NPROC} --target install --config ${BUILD_TYPE} +} + +cmake_build_phi_3_mini() { + cmake -DPYTHON_EXECUTABLE=$PYTHON_EXECUTABLE \ + -DCMAKE_INSTALL_PREFIX=${BUILD_DIR} \ + -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \ + -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \ + -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ + -DEXECUTORCH_BUILD_XNNPACK=ON \ + -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ + -B${BUILD_DIR}/${MODEL_DIR} \ + ${MODEL_DIR} + + cmake --build ${BUILD_DIR}/${MODEL_DIR} -j${NPROC} --config ${BUILD_TYPE} +} + +# Download and convert tokenizer.model +prepare_tokenizer() { + echo "Downloading and converting tokenizer.model" + wget -O tokenizer.model "https://huggingface.co/microsoft/Phi-3-mini-128k-instruct/resolve/main/tokenizer.model?download=true" + $PYTHON_EXECUTABLE -m executorch.extension.llm.tokenizer.tokenizer -t tokenizer.model -o tokenizer.bin +} + +# Export phi-3-mini model to pte +export_phi_3_mini () { + echo "Exporting phi-3-mini. This will take a few minutes" + $PYTHON_EXECUTABLE -m executorch.examples.models.phi-3-mini.export_phi-3-mini -c "4k" -s 128 -o phi-3-mini.pte +} + +run_and_verify() { + NOW=$(date +"%H:%M:%S") + echo "Starting to run phi-3-mini runner at ${NOW}" + if [[ ! -f "phi-3-mini.pte" ]]; then + echo "Export failed. Abort" + exit 1 + fi + if [[ ! -f "tokenizer.bin" ]]; then + echo "tokenizer.bin is missing." + exit 1 + fi + + ${BUILD_DIR}/${MODEL_DIR}/phi_3_mini_runner \ + --model_path=phi-3-mini.pte \ + --tokenizer_path=tokenizer.bin \ + --seq_len=128 \ + --temperature=0 \ + --prompt="<|system|> +You are a helpful assistant.<|end|> +<|user|> +What is the capital of France?<|end|> +<|assistant|>" > result.txt + + # verify result.txt + RESULT=$(cat result.txt) + EXPECTED_RESULT="The capital of France is Paris." + if [[ "${RESULT}" == *"${EXPECTED_RESULT}"* ]]; then + echo "Expected result prefix: ${EXPECTED_RESULT}" + echo "Actual result: ${RESULT}" + echo "Success" + exit 0 + else + echo "Expected result prefix: ${EXPECTED_RESULT}" + echo "Actual result: ${RESULT}" + echo "Failure; results not the same" + exit 1 + fi +} + +# Step 1. Build ExecuTorch and phi-3-mini runner +cmake_install_executorch_libraries +cmake_build_phi_3_mini + +# Step 2. Export the tokenizer and model +prepare_tokenizer +export_phi_3_mini + +# Step 3. 
Run and verify result +run_and_verify diff --git a/.ci/scripts/utils.sh b/.ci/scripts/utils.sh index ebc5361d00a..64c512cdccd 100644 --- a/.ci/scripts/utils.sh +++ b/.ci/scripts/utils.sh @@ -33,42 +33,6 @@ install_pip_dependencies() { popd || return } -install_domains() { - echo "Install torchvision and torchaudio" - pip install --no-use-pep517 --user "git+https://github.com/pytorch/audio.git@${TORCHAUDIO_VERSION}" - pip install --no-use-pep517 --user "git+https://github.com/pytorch/vision.git@${TORCHVISION_VERSION}" -} - -install_pytorch_and_domains() { - pushd .ci/docker || return - TORCH_VERSION=$(cat ci_commit_pins/pytorch.txt) - popd || return - - git clone https://github.com/pytorch/pytorch.git - - # Fetch the target commit - pushd pytorch || return - git checkout "${TORCH_VERSION}" - git submodule update --init --recursive - - export _GLIBCXX_USE_CXX11_ABI=0 - # Then build and install PyTorch - python setup.py bdist_wheel - pip install "$(echo dist/*.whl)" - - # Grab the pinned audio and vision commits from PyTorch - TORCHAUDIO_VERSION=$(cat .github/ci_commit_pins/audio.txt) - export TORCHAUDIO_VERSION - TORCHVISION_VERSION=$(cat .github/ci_commit_pins/vision.txt) - export TORCHVISION_VERSION - - install_domains - - popd || return - # Print sccache stats for debugging - sccache --show-stats || true -} - install_flatc_from_source() { # NB: This function could be used to install flatbuffer from source pushd third-party/flatbuffers || return diff --git a/.github/pytorch-probot.yml b/.github/pytorch-probot.yml index f684d83fa51..2b66829ed0a 100644 --- a/.github/pytorch-probot.yml +++ b/.github/pytorch-probot.yml @@ -1,5 +1,7 @@ # The schema is from https://github.com/pytorch/pytorch/blob/main/.github/pytorch-probot.yml ciflow_push_tags: +- ciflow/android +- ciflow/apple - ciflow/nightly - ciflow/trunk - ciflow/binaries diff --git a/.github/workflows/android-perf.yml b/.github/workflows/android-perf.yml index 78cd342c874..78c1a2dd096 100644 --- a/.github/workflows/android-perf.yml +++ b/.github/workflows/android-perf.yml @@ -15,7 +15,7 @@ on: description: Target devices to run benchmark required: false type: string - default: samsung_galaxy_s2x + default: samsung_galaxy_s22 delegates: description: Backend delegates required: false @@ -45,7 +45,7 @@ on: description: Target devices to run benchmark required: false type: string - default: samsung_galaxy_s2x + default: samsung_galaxy_s22 delegates: description: Backend delegates required: false @@ -84,9 +84,9 @@ jobs: # Separate default values from the workflow dispatch. To ensure defaults are accessible # during scheduled runs and to provide flexibility for different defaults between # on-demand and periodic benchmarking. 
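The set-parameters steps in these perf workflows turn comma-separated workflow inputs into JSON arrays with jq before writing them to GITHUB_OUTPUT, so they can feed the job matrices. A standalone illustration of that pattern with example values (use a throwaway file for GITHUB_OUTPUT when trying it outside Actions):

MODELS="stories110M,dl3,mv3"
echo "models=$(echo "$MODELS" | jq -Rc 'split(",")')" >> "$GITHUB_OUTPUT"
# writes: models=["stories110M","dl3","mv3"]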
- CRON_DEFAULT_MODELS: "stories110M" - CRON_DEFAULT_DEVICES: "samsung_galaxy_s2x" - CRON_DEFAULT_DELEGATES: "xnnpack" + CRON_DEFAULT_MODELS: "stories110M,dl3,mv3,mv2,ic4,ic3,vit" + CRON_DEFAULT_DEVICES: "samsung_galaxy_s22" + CRON_DEFAULT_DELEGATES: "xnnpack,qnn" run: | set -ex MODELS="${{ inputs.models }}" @@ -104,7 +104,8 @@ jobs: # Mapping devices to their corresponding device-pool-arn declare -A DEVICE_POOL_ARNS - DEVICE_POOL_ARNS[samsung_galaxy_s2x]="arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/e59f866a-30aa-4aa1-87b7-4510e5820dfa" + DEVICE_POOL_ARNS[samsung_galaxy_s22]="arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/e59f866a-30aa-4aa1-87b7-4510e5820dfa" + DEVICE_POOL_ARNS[samsung_galaxy_s24]="arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/98f8788c-2e25-4a3c-8bb2-0d1e8897c0db" # Resolve device names with their corresponding ARNs if [[ ! $(echo "$DEVICES" | jq empty 2>/dev/null) ]]; then @@ -139,6 +140,7 @@ jobs: submodules: 'true' timeout: 60 upload-artifact: android-models + upload-artifact-to-s3: true script: | # The generic Linux job chooses to use base env, not the one setup by the image echo "::group::Setting up dev environment" @@ -156,54 +158,28 @@ jobs: BUILD_MODE="cmake" DTYPE="fp32" - if [[ ${{ matrix.model }} == "stories*"" ]]; then + if [[ ${{ matrix.model }} =~ ^stories* ]]; then # Install requirements for export_llama PYTHON_EXECUTABLE=python bash examples/models/llama2/install_requirements.sh # Test llama2 if [[ ${{ matrix.delegate }} == "xnnpack" ]]; then DELEGATE_CONFIG="xnnpack+custom+qe" + elif [[ ${{ matrix.delegate }} == "qnn" ]]; then + DELEGATE_CONFIG="qnn" + else + echo "Unsupported delegate ${{ matrix.delegate }}" + exit 1 fi PYTHON_EXECUTABLE=python bash .ci/scripts/test_llama.sh "${{ matrix.model }}" "${BUILD_MODE}" "${DTYPE}" "${DELEGATE_CONFIG}" "${ARTIFACTS_DIR_NAME}" else - PYTHON_EXECUTABLE=python bash .ci/scripts/test.sh "${{ matrix.model }}" "${BUILD_MODE}" "${{ matrix.delegate }}" "${ARTIFACTS_DIR_NAME}" + PYTHON_EXECUTABLE=python bash .ci/scripts/test_model.sh "${{ matrix.model }}" "${BUILD_MODE}" "${{ matrix.delegate }}" "${ARTIFACTS_DIR_NAME}" fi echo "::endgroup::" - # Upload models to S3. 
The artifacts are needed not only by the device farm but also TorchChat - upload-models: - needs: export-models - runs-on: linux.2xlarge - steps: - - name: Download the models from GitHub - uses: actions/download-artifact@v3 - with: - # The name here needs to match the name of the upload-artifact parameter - name: android-models - path: ${{ runner.temp }}/artifacts/ - - - name: Verify the models - shell: bash - working-directory: ${{ runner.temp }}/artifacts/ - run: | - ls -lah ./ - - - name: Upload the models to S3 - uses: seemethere/upload-artifact-s3@v5 - with: - s3-bucket: gha-artifacts - s3-prefix: | - ${{ github.repository }}/${{ github.run_id }}/artifact - retention-days: 1 - if-no-files-found: ignore - path: ${{ runner.temp }}/artifacts/ - build-llm-demo: name: build-llm-demo uses: pytorch/test-infra/.github/workflows/linux_job.yml@main needs: set-parameters - strategy: - matrix: - tokenizer: [bpe] with: runner: linux.2xlarge docker-image: executorch-ubuntu-22.04-clang12-android @@ -211,6 +187,7 @@ jobs: ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} timeout: 90 upload-artifact: android-apps + upload-artifact-to-s3: true script: | set -eux @@ -220,37 +197,11 @@ jobs: PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh cmake export ARTIFACTS_DIR_NAME=artifacts-to-be-uploaded - # TODO: This needs to be replaced with a generic loader .apk - # Build LLM Demo for Android - bash build/build_android_llm_demo.sh ${{ matrix.tokenizer }} ${ARTIFACTS_DIR_NAME} - - # Upload artifacts to S3. The artifacts are needed not only by the device farm but also TorchChat - upload-android-apps: - needs: build-llm-demo - runs-on: linux.2xlarge - steps: - - name: Download the apps from GitHub - uses: actions/download-artifact@v3 - with: - # The name here needs to match the name of the upload-artifact parameter - name: android-apps - path: ${{ runner.temp }}/artifacts/ - - - name: Verify the apps - shell: bash - working-directory: ${{ runner.temp }}/artifacts/ - run: | - ls -lah ./ + PYTHON_EXECUTABLE=python bash .ci/scripts/setup-qnn-deps.sh + PYTHON_EXECUTABLE=python bash .ci/scripts/build-qnn-sdk.sh - - name: Upload the apps to S3 - uses: seemethere/upload-artifact-s3@v5 - with: - s3-bucket: gha-artifacts - s3-prefix: | - ${{ github.repository }}/${{ github.run_id }}/artifact - retention-days: 14 - if-no-files-found: ignore - path: ${{ runner.temp }}/artifacts/ + export ANDROID_ABIS="arm64-v8a" + PYTHON_EXECUTABLE=python EXECUTORCH_BUILD_QNN=ON QNN_SDK_ROOT=/tmp/qnn/2.25.0.240728 bash build/build_android_llm_demo.sh ${ARTIFACTS_DIR_NAME} # Let's see how expensive this job is, we might want to tone it down by running it periodically benchmark-on-device: @@ -260,14 +211,17 @@ jobs: uses: pytorch/test-infra/.github/workflows/mobile_job.yml@main needs: - set-parameters - - upload-models - - upload-android-apps + - build-llm-demo + - export-models strategy: matrix: model: ${{ fromJson(needs.set-parameters.outputs.models) }} delegate: ${{ fromJson(needs.set-parameters.outputs.delegates) }} device: ${{ fromJson(needs.set-parameters.outputs.devices) }} + fail-fast: false with: + # Due to scheduling a job may be pushed beyond the default 60m threshold + timeout: 120 device-type: android runner: linux.2xlarge test-infra-ref: '' @@ -278,10 +232,9 @@ jobs: # Unlike models there are limited numbers of build flavor for apps, and the model controls whether it should build with bpe/tiktoken tokenizer. 
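A note on the condition change earlier in this workflow: the old test compared the model name against the quoted pattern "stories*" followed by a stray quote, so bash took the right-hand side literally and it never matched stories110M; the new =~ ^stories* form treats the right-hand side as a regular expression (the trailing * only means zero or more extra s characters, which is harmless). A quick illustration outside the workflow:

model="stories110M"
[[ "$model" == stories*   ]] && echo "unquoted == pattern: glob, matches"
[[ "$model" == "stories*" ]] || echo "quoted == pattern: literal, does not match"
[[ "$model" =~ ^stories   ]] && echo "=~ pattern: regex, matches"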
# It's okay to build all possible apps with all possible flavors in job "build-llm-demo". However, in this job, once a model is given, there is only # one app+flavor that could load and run the model. - # TODO: Hard code llm_demo_bpe for now in this job. - android-app-archive: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifact/llm_demo_bpe/app-debug.apk - android-test-archive: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifact/llm_demo_bpe/app-debug-androidTest.apk + android-app-archive: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifacts/minibench/app-debug.apk + android-test-archive: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifacts/minibench/app-debug-androidTest.apk # NB: Need to set the default spec here so that it works for periodic too test-spec: ${{ inputs.test_spec || 'https://ossci-android.s3.amazonaws.com/executorch/android-llm-device-farm-test-spec.yml' }} # Uploaded to S3 from the previous job - extra-data: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifact/${{ matrix.model }}_${{ matrix.delegate }}/model.zip + extra-data: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifacts/${{ matrix.model }}_${{ matrix.delegate }}/model.zip diff --git a/.github/workflows/android.yml b/.github/workflows/android.yml index e33b6e78334..54e9dbb7619 100644 --- a/.github/workflows/android.yml +++ b/.github/workflows/android.yml @@ -5,6 +5,8 @@ on: branches: - main - release/* + tags: + - ciflow/android/* pull_request: paths: - .ci/docker/** @@ -24,9 +26,6 @@ jobs: build-llm-demo: name: build-llm-demo uses: pytorch/test-infra/.github/workflows/linux_job.yml@main - strategy: - matrix: - tokenizer: [bpe, tiktoken] with: runner: linux.2xlarge docker-image: executorch-ubuntu-22.04-clang12-android @@ -34,6 +33,7 @@ jobs: ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} timeout: 90 upload-artifact: android-apps + upload-artifact-to-s3: true script: | set -eux @@ -44,44 +44,13 @@ jobs: export ARTIFACTS_DIR_NAME=artifacts-to-be-uploaded # Build LLM Demo for Android - bash build/build_android_llm_demo.sh ${{ matrix.tokenizer }} ${ARTIFACTS_DIR_NAME} - - # Upload artifacts to S3. The artifacts are needed not only by the device farm but also TorchChat - upload-artifacts: - needs: build-llm-demo - runs-on: linux.2xlarge - steps: - - name: Download the artifacts from GitHub - uses: actions/download-artifact@v3 - with: - # The name here needs to match the name of the upload-artifact parameter - name: android-apps - path: ${{ runner.temp }}/artifacts/ - - - name: Verify the artifacts - shell: bash - working-directory: ${{ runner.temp }}/artifacts/ - run: | - ls -lah ./ - - - name: Upload the artifacts to S3 - uses: seemethere/upload-artifact-s3@v5 - with: - s3-bucket: gha-artifacts - s3-prefix: | - ${{ github.repository }}/${{ github.run_id }}/artifact - # NOTE: Consume stale artifacts won't make sense for benchmarking as the goal is always to - # benchmark models as fresh as possible. I'm okay to keep the 14 retention-days for now - # for TorchChat until we have a periodic job can publish it more often. Ideally I want to - # reduce it to <= 2 day, meaning the benchmark job will run daily. 
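With upload-artifact-to-s3: true replacing the separate upload jobs, the device-farm jobs now address artifacts purely by convention. A sketch of the layout the URLs above assume, written with the shell equivalents of the github.repository and github.run_id expressions (illustrative only; MODEL and DELEGATE stand for the matrix values):

S3_BASE="https://gha-artifacts.s3.amazonaws.com/${GITHUB_REPOSITORY}/${GITHUB_RUN_ID}/artifacts"
APP_APK="${S3_BASE}/minibench/app-debug.apk"
TEST_APK="${S3_BASE}/minibench/app-debug-androidTest.apk"
MODEL_ZIP="${S3_BASE}/${MODEL}_${DELEGATE}/model.zip"   # e.g. stories110M_xnnpack/model.zip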
- retention-days: 14 - if-no-files-found: ignore - path: ${{ runner.temp }}/artifacts/ + bash build/build_android_llm_demo.sh ${ARTIFACTS_DIR_NAME} # Running Android emulator directly on the runner and not using Docker run-emulator: needs: build-llm-demo - runs-on: amz2023.linux.4xlarge + # NB: Use metal install for KVM support to run the emulator faster + runs-on: linux.24xl.spr-metal env: ANDROID_NDK_VERSION: r26c API_LEVEL: 34 @@ -129,9 +98,6 @@ jobs: uses: reactivecircus/android-emulator-runner@v2 with: api-level: ${{ env.API_LEVEL }} - # NB: x86_64 emulator is slow because the lack of KVM support on AWS, it - # seems that we can use metal instance for that but it hasn't been tried - # out yet. Also arm64-v8a arch requires an ARM runner arch: x86_64 script: ./build/run_android_emulator.sh # NB: This is to boot the emulator faster following the instructions on @@ -144,36 +110,3 @@ jobs: emulator-options: -no-snapshot-save -no-window -gpu swiftshader_indirect -noaudio -no-boot-anim -camera-back none # This is to make sure that the job doesn't fail flakily emulator-boot-timeout: 900 - - # Let's see how expensive this job is, we might want to tone it down by running it periodically - test-llama-app: - # Only PR from ExecuTorch itself has permission to access AWS, forked PRs will fail to - # authenticate with the cloud service - if: ${{ !github.event.pull_request.head.repo.fork }} - needs: upload-artifacts - permissions: - id-token: write - contents: read - uses: pytorch/test-infra/.github/workflows/mobile_job.yml@main - strategy: - matrix: - # https://github.com/pytorch/executorch/blob/main/examples/demo-apps/android/LlamaDemo/README.md#alternative-2-build-from-local-machine - # mentions that tiktoken is only for Llama3. So, we can export it later in another archive - # like https://ossci-assets.s3.amazonaws.com/executorch-android-llama2-7b-0717.zip when this is - # updated to run Llama3 - tokenizer: [bpe] - with: - device-type: android - runner: linux.2xlarge - test-infra-ref: '' - # This is the ARN of ExecuTorch project on AWS - project-arn: arn:aws:devicefarm:us-west-2:308535385114:project:02a2cf0f-6d9b-45ee-ba1a-a086587469e6 - # This is the custom Android device pool that only includes Samsung Galaxy S2x - device-pool-arn: arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/e59f866a-30aa-4aa1-87b7-4510e5820dfa - # Uploaded to S3 from the previous job, the name of the app comes from the project itself - android-app-archive: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifact/llm_demo_${{ matrix.tokenizer }}/app-debug.apk - android-test-archive: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifact/llm_demo_${{ matrix.tokenizer }}/app-debug-androidTest.apk - test-spec: https://ossci-android.s3.amazonaws.com/executorch/android-llm-device-farm-test-spec.yml - # Among the input, this is the biggest file, so it is cached on AWS to make the test faster. Note that the file is deleted by AWS after 30 - # days and the job will automatically re-upload the file when that happens. 
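The run-emulator job above moves to a metal runner specifically so the emulator can use KVM. A quick sanity check one could run on the runner to confirm hardware acceleration is actually available; this is an illustrative sketch, not part of the patch:

if [[ -r /dev/kvm && -w /dev/kvm ]]; then
  echo "KVM available: the x86_64 emulator can boot with hardware acceleration"
else
  echo "No usable /dev/kvm: the emulator falls back to much slower software emulation" >&2
fi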
- extra-data: https://ossci-assets.s3.amazonaws.com/executorch-android-llama2-7b-0717.zip diff --git a/.github/workflows/apple-perf.yml b/.github/workflows/apple-perf.yml new file mode 100644 index 00000000000..b4b1d3aef58 --- /dev/null +++ b/.github/workflows/apple-perf.yml @@ -0,0 +1,308 @@ +name: apple-perf + +on: + schedule: + - cron: 0 1 * * * + # Note: GitHub has an upper limit of 10 inputs + workflow_dispatch: + inputs: + models: + description: Models to be benchmarked + required: false + type: string + default: stories110M + devices: + description: Target devices to run benchmark + required: false + type: string + default: apple_iphone_15 + delegates: + description: Backend delegates + required: false + type: string + default: xnnpack + benchmark_configs: + description: The list of configs used the benchmark + required: false + type: string + test_spec: + description: The test spec to drive the test on AWS devices + required: false + type: string + workflow_call: + inputs: + models: + description: Models to be benchmarked + required: false + type: string + default: stories110M + devices: + description: Target devices to run benchmark + required: false + type: string + default: apple_iphone_15 + delegates: + description: Backend delegates + required: false + type: string + default: xnnpack + benchmark_configs: + description: The list of configs used the benchmark + required: false + type: string + test_spec: + description: The test spec to drive the test on AWS devices + required: false + type: string + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }} + cancel-in-progress: true + +jobs: + set-parameters: + runs-on: linux.2xlarge + outputs: + models: ${{ steps.set-parameters.outputs.models }} + devices: ${{ steps.set-parameters.outputs.devices }} + delegates: ${{ steps.set-parameters.outputs.delegates }} + steps: + - name: Set parameters + id: set-parameters + shell: bash + env: + # Separate default values from the workflow dispatch. To ensure defaults are accessible + # during scheduled runs and to provide flexibility for different defaults between + # on-demand and periodic benchmarking. + CRON_DEFAULT_MODELS: "stories110M,mv3,ic4,resnet50,edsr,mobilebert,w2l" + CRON_DEFAULT_DEVICES: "apple_iphone_15" + CRON_DEFAULT_DELEGATES: "xnnpack,coreml" + run: | + set -ex + MODELS="${{ inputs.models }}" + if [ -z "$MODELS" ]; then + MODELS="$CRON_DEFAULT_MODELS" + fi + DEVICES="${{ inputs.devices }}" + if [ -z "$DEVICES" ]; then + DEVICES="$CRON_DEFAULT_DEVICES" + fi + DELEGATES="${{ inputs.delegates }}" + if [ -z "$DELEGATES" ]; then + DELEGATES="$CRON_DEFAULT_DELEGATES" + fi + + # Mapping devices to their corresponding device-pool-arn + declare -A DEVICE_POOL_ARNS + DEVICE_POOL_ARNS[apple_iphone_15]="arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/3b5acd2e-92e2-4778-b651-7726bafe129d" + + # Resolve device names with their corresponding ARNs + if [[ ! $(echo "$DEVICES" | jq empty 2>/dev/null) ]]; then + DEVICES=$(echo "$DEVICES" | jq -Rc 'split(",")') + fi + declare -a MAPPED_ARNS=() + for DEVICE in $(echo "$DEVICES" | jq -r '.[]'); do + if [[ -z "${DEVICE_POOL_ARNS[$DEVICE]}" ]]; then + echo "Error: No ARN found for device '$DEVICE'. Abort." 
>&2 + exit 1 + fi + MAPPED_ARNS+=("${DEVICE_POOL_ARNS[$DEVICE]}") + done + + echo "models=$(echo $MODELS | jq -Rc 'split(",")')" >> $GITHUB_OUTPUT + MAPPED_ARNS_JSON=$(printf '%s\n' "${MAPPED_ARNS[@]}" | jq -R . | jq -s .) + echo "devices=$(echo "$MAPPED_ARNS_JSON" | jq -c .)" >> $GITHUB_OUTPUT + echo "delegates=$(echo $DELEGATES | jq -Rc 'split(",")')" >> $GITHUB_OUTPUT + + export-models: + name: export-models + uses: pytorch/test-infra/.github/workflows/macos_job.yml@main + needs: set-parameters + strategy: + matrix: + model: ${{ fromJson(needs.set-parameters.outputs.models) }} + delegate: ${{ fromJson(needs.set-parameters.outputs.delegates) }} + fail-fast: false + with: + # NB: Need to use our AWS MacOS runner to upload large models to S3 + runner: macos-m1-stable + python-version: '3.11' + submodules: 'true' + timeout: 60 + upload-artifact: ios-models + upload-artifact-to-s3: true + script: | + set -eux + + echo "::group::Setting up CI environment" + .ci/scripts/setup-conda.sh + + BUILD_TOOL=cmake + # Setup MacOS dependencies as there is no Docker support on MacOS atm + GITHUB_RUNNER=1 PYTHON_EXECUTABLE=python ${CONDA_RUN} --no-capture-output \ + .ci/scripts/setup-macos.sh "${BUILD_TOOL}" + + if [[ ${{ matrix.delegate }} == "coreml" ]]; then + PYTHON_EXECUTABLE=python ${CONDA_RUN} --no-capture-output \ + backends/apple/coreml/scripts/install_requirements.sh + fi + + if [[ ${{ matrix.delegate }} == "mps" ]]; then + PYTHON_EXECUTABLE=python ${CONDA_RUN} --no-capture-output \ + backends/apple/mps/install_requirements.sh + fi + + ARTIFACTS_DIR_NAME=artifacts-to-be-uploaded/${{ matrix.model }}_${{ matrix.delegate }} + echo "::endgroup::" + + echo "::group::Exporting ${{ matrix.delegate }} model: ${{ matrix.model }}" + BUILD_MODE="cmake" + DTYPE="fp32" + + if [[ ${{ matrix.model }} =~ ^stories* ]]; then + # Install requirements for export_llama + PYTHON_EXECUTABLE=python ${CONDA_RUN} --no-capture-output \ + bash examples/models/llama2/install_requirements.sh + + # Test llama2 + if [[ ${{ matrix.delegate }} == "xnnpack" ]]; then + DELEGATE_CONFIG="xnnpack+custom+qe" + elif [[ ${{ matrix.delegate }} == "coreml" ]]; then + DELEGATE_CONFIG="coreml" + fi + PYTHON_EXECUTABLE=python ${CONDA_RUN} --no-capture-output \ + bash .ci/scripts/test_llama.sh "${{ matrix.model }}" "${BUILD_MODE}" "${DTYPE}" "${DELEGATE_CONFIG}" "${ARTIFACTS_DIR_NAME}" + else + PYTHON_EXECUTABLE=python ${CONDA_RUN} --no-capture-output \ + bash .ci/scripts/test_model.sh "${{ matrix.model }}" "${BUILD_MODE}" "${{ matrix.delegate }}" "${ARTIFACTS_DIR_NAME}" + fi + echo "::endgroup::" + + build-benchmark-app: + name: build-benchmark-app + uses: pytorch/test-infra/.github/workflows/macos_job.yml@main + needs: + - set-parameters + secrets: inherit + with: + runner: macos-latest-xlarge + python-version: '3.11' + submodules: 'true' + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + upload-artifact: ios-apps + secrets-env: BUILD_CERTIFICATE_BASE64 EXECUTORCH_BENCHMARK_BUILD_PROVISION_PROFILE_BASE64 KEYCHAIN_PASSWORD + timeout: 90 + script: | + set -eux + + echo "::group::Setting up CI environment" + .ci/scripts/setup-conda.sh + + BUILD_TOOL=cmake + # Setup MacOS dependencies as there is no Docker support on MacOS atm + GITHUB_RUNNER=1 PYTHON_EXECUTABLE=python ${CONDA_RUN} --no-capture-output \ + .ci/scripts/setup-macos.sh "${BUILD_TOOL}" + export ARTIFACTS_DIR_NAME=artifacts-to-be-uploaded + + # Setup Apple certificate for iOS development + 
BUILD_PROVISION_PROFILE_BASE64="${SECRET_EXECUTORCH_BENCHMARK_BUILD_PROVISION_PROFILE_BASE64}" \ + BUILD_CERTIFICATE_BASE64="${SECRET_BUILD_CERTIFICATE_BASE64}" \ + KEYCHAIN_PASSWORD="${SECRET_KEYCHAIN_PASSWORD}" \ + .ci/scripts/setup-ios.sh + + # Install CoreML Backend Requirements + PYTHON_EXECUTABLE=python ${CONDA_RUN} --no-capture-output \ + backends/apple/coreml/scripts/install_requirements.sh + + # Install MPS Backend Requirements + PYTHON_EXECUTABLE=python ${CONDA_RUN} --no-capture-output \ + backends/apple/mps/install_requirements.sh + echo "::endgroup::" + + echo "::group::Build ExecuTorch iOS frameworks" + FRAMEWORKS=( + "executorch" + "backend_coreml" + "backend_mps" + "backend_xnnpack" + "kernels_custom" + "kernels_optimized" + "kernels_portable" + "kernels_quantized" + ) + + # Build Release iOS Frameworks + PYTHON_EXECUTABLE=python ${CONDA_RUN} --no-capture-output \ + build/build_apple_frameworks.sh --coreml --custom --mps --optimized --portable --quantized --xnnpack + + mkdir -p extension/apple/Benchmark/Frameworks + for FRAMEWORK in "${FRAMEWORKS[@]}"; do ( + cp -r "cmake-out/${FRAMEWORK}.xcframework" extension/apple/Benchmark/Frameworks/ + ) done + echo "::endgroup::" + + # NB: Although exported models can be copied to this directory and bundled together with the + # app, we don't use this in CI and rely on AWS extra data parameter to make the model and the + # tokenizer available to the benchmark. This decouples the app and the model. We just need to + # create the directory here to pass the build + mkdir -p extension/apple/Benchmark/Models + ${CONDA_RUN} --no-capture-output \ + build/build_apple_llm_demo.sh ${ARTIFACTS_DIR_NAME} + + upload-benchmark-app: + needs: build-benchmark-app + runs-on: linux.2xlarge + steps: + - name: Download the apps from GitHub + uses: actions/download-artifact@v3 + with: + # The name here needs to match the name of the upload-artifact parameter + name: ios-apps + path: ${{ runner.temp }}/artifacts/ + + - name: Verify the apps + shell: bash + working-directory: ${{ runner.temp }}/artifacts/ + run: | + ls -lah ./ + + - name: Upload the apps to S3 + uses: seemethere/upload-artifact-s3@v5 + with: + s3-bucket: gha-artifacts + s3-prefix: | + ${{ github.repository }}/${{ github.run_id }}/artifacts + retention-days: 14 + if-no-files-found: ignore + path: ${{ runner.temp }}/artifacts/ + + benchmark-on-device: + needs: + - set-parameters + - upload-benchmark-app + - export-models + permissions: + id-token: write + contents: read + uses: pytorch/test-infra/.github/workflows/mobile_job.yml@main + strategy: + matrix: + model: ${{ fromJson(needs.set-parameters.outputs.models) }} + delegate: ${{ fromJson(needs.set-parameters.outputs.delegates) }} + device: ${{ fromJson(needs.set-parameters.outputs.devices) }} + fail-fast: false + with: + # Due to scheduling a job may be pushed beyond the default 60m threshold + timeout: 120 + device-type: ios + # For iOS testing, the runner just needs to call AWS Device Farm, so there is no need to run this on macOS + runner: linux.2xlarge + test-infra-ref: '' + # This is the ARN of ExecuTorch project on AWS + project-arn: arn:aws:devicefarm:us-west-2:308535385114:project:02a2cf0f-6d9b-45ee-ba1a-a086587469e6 + device-pool-arn: ${{ matrix.device }} + # Uploaded to S3 from the previous job + ios-ipa-archive: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifacts/Benchmark.ipa + ios-xctestrun-zip: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id 
}}/artifacts/Benchmark.xctestrun.zip + test-spec: ${{ inputs.test_spec || 'https://ossci-ios.s3.amazonaws.com/executorch/default-ios-device-farm-appium-test-spec.yml' }} + extra-data: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifacts/${{ matrix.model }}_${{ matrix.delegate }}/model.zip diff --git a/.github/workflows/apple.yml b/.github/workflows/apple.yml index 60022b81f9e..229d8e5abf6 100644 --- a/.github/workflows/apple.yml +++ b/.github/workflows/apple.yml @@ -8,13 +8,14 @@ on: pull_request: paths: - .ci/docker/** + - .ci/scripts/setup-ios.sh - .github/workflows/apple.yml - install_requirements.sh - backends/apple/** - build/build_apple_frameworks.sh - build/create_frameworks.sh - build/test_ios_ci.sh - - examples/demo-apps/** + - examples/demo-apps/apple_ios/** - extension/apple/** - extension/module/** workflow_dispatch: @@ -24,27 +25,89 @@ concurrency: cancel-in-progress: true jobs: - test-demo-ios: - name: test-demo-ios + build-demo-ios: + name: build-demo-ios uses: pytorch/test-infra/.github/workflows/macos_job.yml@main + secrets: inherit with: runner: macos-latest-xlarge python-version: '3.11' submodules: 'true' ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} timeout: 90 + secrets-env: BUILD_CERTIFICATE_BASE64 EXECUTORCH_DEMO_BUILD_PROVISION_PROFILE_BASE64 KEYCHAIN_PASSWORD + upload-artifact: ios-apps script: | BUILD_TOOL=cmake .ci/scripts/setup-conda.sh + # Setup Apple certificate for iOS development + BUILD_PROVISION_PROFILE_BASE64="${SECRET_EXECUTORCH_DEMO_BUILD_PROVISION_PROFILE_BASE64}" \ + BUILD_CERTIFICATE_BASE64="${SECRET_BUILD_CERTIFICATE_BASE64}" \ + KEYCHAIN_PASSWORD="${SECRET_KEYCHAIN_PASSWORD}" \ + .ci/scripts/setup-ios.sh + # Setup MacOS dependencies as there is no Docker support on MacOS atm GITHUB_RUNNER=1 PYTHON_EXECUTABLE=python ${CONDA_RUN} --no-capture-output \ .ci/scripts/setup-macos.sh "${BUILD_TOOL}" + export ARTIFACTS_DIR_NAME=artifacts-to-be-uploaded + # Build and test iOS Demo App PYTHON_EXECUTABLE=python ${CONDA_RUN} --no-capture-output \ - build/test_ios_ci.sh + build/test_ios_ci.sh ${ARTIFACTS_DIR_NAME} + + # Upload the test demo app to S3 + upload-demo-ios: + needs: build-demo-ios + runs-on: linux.2xlarge + steps: + - name: Download the artifacts from GitHub + uses: actions/download-artifact@v3 + with: + # The name here needs to match the name of the upload-artifact parameter + name: ios-apps + path: ${{ runner.temp }}/artifacts/ + + - name: Verify the artifacts + shell: bash + working-directory: ${{ runner.temp }}/artifacts/ + run: | + ls -lah ./ + + - name: Upload the artifacts to S3 + uses: seemethere/upload-artifact-s3@v5 + with: + s3-bucket: gha-artifacts + s3-prefix: | + ${{ github.repository }}/${{ github.run_id }}/artifact + retention-days: 14 + if-no-files-found: ignore + path: ${{ runner.temp }}/artifacts/ + + test-demo-ios: + # Only PR from ExecuTorch itself has permission to access AWS, forked PRs will fail to + # authenticate with the cloud service. 
So, this job will be skipped on the latter + if: ${{ !github.event.pull_request.head.repo.fork }} + needs: upload-demo-ios + permissions: + id-token: write + contents: read + uses: pytorch/test-infra/.github/workflows/mobile_job.yml@main + with: + device-type: ios + # For iOS testing, the runner just needs to call AWS Device Farm, so there is no need to run this on macOS + runner: linux.2xlarge + test-infra-ref: '' + # This is the ARN of ExecuTorch project on AWS + project-arn: arn:aws:devicefarm:us-west-2:308535385114:project:02a2cf0f-6d9b-45ee-ba1a-a086587469e6 + # This is the custom device pool that only includes iOS devices + device-pool-arn: arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/3b5acd2e-92e2-4778-b651-7726bafe129d + # Uploaded to S3 from the previous job + ios-ipa-archive: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifact/ExecuTorchDemo.ipa + ios-xctestrun-zip: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifact/ExecuTorchDemo.xctestrun.zip + test-spec: https://ossci-ios.s3.amazonaws.com/executorch/default-ios-device-farm-appium-test-spec.yml build-frameworks-ios: name: build-frameworks-ios diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index 7cb2cf69b8b..56b70409d79 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -54,3 +54,25 @@ jobs: lint.json || true exit $RC + + android-java-format: + uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + with: + runner: linux.2xlarge + docker-image: executorch-ubuntu-22.04-linter + fetch-depth: 0 + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + script: | + FILES_NEEDS_FORMAT=$(/opt/google-java-format -n extension/android/src/main/java/org/pytorch/executorch/*.java \ + examples/demo-apps/android/ExecuTorchDemo/app/src/main/java/com/example/executorchdemo/*.java \ + examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/*.java \ + extension/android/benchmark/app/src/main/java/org/pytorch/minibench/*.java) + if [ -n "$FILES_NEEDS_FORMAT" ]; then + echo "Warning: The following files need formatting. Please use google-java-format." + echo "Use a binary from https://github.com/google/google-java-format/releases/" + echo "For example:" + echo "wget https://github.com/google/google-java-format/releases/download/v1.23.0/google-java-format_linux-x86-64" + echo "chmod +x google-java-format_linux-x86-64" + echo "./google-java-format_linux-x86-64 -i $FILES_NEEDS_FORMAT" + exit 1 + fi diff --git a/.github/workflows/periodic.yml b/.github/workflows/periodic.yml index 4cc57b0c7f1..df13140ca92 100644 --- a/.github/workflows/periodic.yml +++ b/.github/workflows/periodic.yml @@ -62,4 +62,4 @@ jobs: PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh "${BUILD_TOOL}" # Build and test ExecuTorch - PYTHON_EXECUTABLE=python bash .ci/scripts/test.sh "${MODEL_NAME}" "${BUILD_TOOL}" "${BACKEND}" "${DEMO_BACKEND_DELEGATION}" + PYTHON_EXECUTABLE=python bash .ci/scripts/test_model.sh "${MODEL_NAME}" "${BUILD_TOOL}" "${BACKEND}" "${DEMO_BACKEND_DELEGATION}" diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index 3e346c716e7..f7d2b627bc5 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -54,7 +54,7 @@ jobs: PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh "${BUILD_TOOL}" # Build and test ExecuTorch with the add model on portable backend. 
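The android-java-format lint job above runs google-java-format with -n (dry run: print the files that would change) and fails if the list is non-empty. To fix formatting locally, run the same binary with -i, as the job's hint suggests; condensed from the instructions the job prints:

wget https://github.com/google/google-java-format/releases/download/v1.23.0/google-java-format_linux-x86-64
chmod +x google-java-format_linux-x86-64
./google-java-format_linux-x86-64 -i \
  extension/android/src/main/java/org/pytorch/executorch/*.java \
  extension/android/benchmark/app/src/main/java/org/pytorch/minibench/*.java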
- PYTHON_EXECUTABLE=python bash .ci/scripts/test.sh "add" "${BUILD_TOOL}" "portable" + PYTHON_EXECUTABLE=python bash .ci/scripts/test_model.sh "add" "${BUILD_TOOL}" "portable" test-models-linux: name: test-models-linux @@ -81,7 +81,7 @@ jobs: PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh "${BUILD_TOOL}" # Build and test ExecuTorch - PYTHON_EXECUTABLE=python bash .ci/scripts/test.sh "${MODEL_NAME}" "${BUILD_TOOL}" "${BACKEND}" "${DEMO_BACKEND_DELEGATION}" + PYTHON_EXECUTABLE=python bash .ci/scripts/test_model.sh "${MODEL_NAME}" "${BUILD_TOOL}" "${BACKEND}" "${DEMO_BACKEND_DELEGATION}" test-llama-runner-linux: name: test-llama-runner-linux @@ -91,6 +91,13 @@ jobs: dtype: [fp32] build-tool: [buck2, cmake] mode: [portable, xnnpack+custom, xnnpack+custom+qe] + include: + - dtype: bf16 + build-tool: cmake + mode: portable + - dtype: bf16 + build-tool: buck2 + mode: portable fail-fast: false with: runner: linux.2xlarge @@ -407,3 +414,30 @@ jobs: PYTHON_EXECUTABLE=python bash examples/models/llama2/install_requirements.sh # Test llama2 PYTHON_EXECUTABLE=python bash .ci/scripts/test_llama.sh stories110M "${BUILD_TOOL}" "${DTYPE}" "${MODE}" + + test-phi-3-mini-runner-linux: + name: test-phi-3-mini-runner-linux + uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + strategy: + fail-fast: false + with: + runner: linux.24xlarge + docker-image: executorch-ubuntu-22.04-clang12 + submodules: 'true' + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + timeout: 90 + script: | + # The generic Linux job chooses to use base env, not the one setup by the image + CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]") + conda activate "${CONDA_ENV}" + + PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh "cmake" + + # install pybind + bash install_requirements.sh --pybind xnnpack + + # install phi-3-mini requirements + bash examples/models/phi-3-mini/install_requirements.sh + + # run e2e (export, tokenizer and runner) + PYTHON_EXECUTABLE=python bash .ci/scripts/test_phi_3_mini.sh diff --git a/.github/workflows/trunk.yml b/.github/workflows/trunk.yml index 98d14824638..d7130561fa6 100644 --- a/.github/workflows/trunk.yml +++ b/.github/workflows/trunk.yml @@ -59,7 +59,7 @@ jobs: # Setup MacOS dependencies as there is no Docker support on MacOS atm PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/setup-macos.sh "${BUILD_TOOL}" # Build and test xecutorch - PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/test.sh "${MODEL_NAME}" "${BUILD_TOOL}" "${BACKEND}" "${DEMO_BACKEND_DELEGATION}" + PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/test_model.sh "${MODEL_NAME}" "${BUILD_TOOL}" "${BACKEND}" "${DEMO_BACKEND_DELEGATION}" test-custom-ops-macos: name: test-custom-ops-macos @@ -143,7 +143,6 @@ jobs: conda activate "${CONDA_ENV}" source .ci/scripts/utils.sh - install_flatc_from_source install_executorch install_arm @@ -169,7 +168,6 @@ jobs: conda activate "${CONDA_ENV}" source .ci/scripts/utils.sh - install_flatc_from_source install_executorch install_arm @@ -225,8 +223,10 @@ jobs: strategy: matrix: dtype: [fp32] - build-tool: [buck2, cmake] mode: [portable, xnnpack+kv+custom, mps, coreml] + include: + - dtype: bf16 + mode: portable fail-fast: false with: runner: macos-m1-stable @@ -237,25 +237,12 @@ jobs: script: | DTYPE=${{ matrix.dtype }} - BUILD_TOOL=${{ matrix.build-tool }} MODE=${{ matrix.mode }} - if [[ "${BUILD_TOOL}" == "buck2" ]]; then - # TODO: Will add more modes that don't support buck2 - if [[ "${MODE}" == 
"mps" ]]; then - echo "mps doesn't support buck2." - exit 0 - fi - if [[ "${MODE}" == "coreml" ]]; then - echo "coreml doesn't support buck2." - exit 0 - fi - fi - bash .ci/scripts/setup-conda.sh # Setup executorch - PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/setup-macos.sh "${BUILD_TOOL}" + PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/setup-macos.sh cmake if [[ "${MODE}" == "mps" ]]; then # Install mps delegate @@ -270,7 +257,36 @@ jobs: # Install requirements for export_llama PYTHON_EXECUTABLE=python ${CONDA_RUN} bash examples/models/llama2/install_requirements.sh # Test llama2 - PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/test_llama.sh stories110M "${BUILD_TOOL}" "${DTYPE}" "${MODE}" + PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/test_llama.sh stories110M cmake "${DTYPE}" "${MODE}" + + # # TODO(jackzhxng): Runner consistently runs out of memory before test finishes. Try to find a more powerful runner. + # test-llava-runner-macos: + # name: test-llava-runner-macos + # uses: pytorch/test-infra/.github/workflows/macos_job.yml@main + # strategy: + # fail-fast: false + # with: + # runner: macos-14-xlarge + # python-version: '3.11' + # submodules: 'true' + # ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + # timeout: 900 + # script: | + # BUILD_TOOL=cmake + + # bash .ci/scripts/setup-conda.sh + # # Setup MacOS dependencies as there is no Docker support on MacOS atm + # GITHUB_RUNNER=1 PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/setup-macos.sh "${BUILD_TOOL}" + + # # install Llava requirements + # ${CONDA_RUN} bash examples/models/llama2/install_requirements.sh + # ${CONDA_RUN} bash examples/models/llava/install_requirements.sh + + # # run python unittest + # ${CONDA_RUN} python -m unittest examples.models.llava.test.test_llava + + # # run e2e (export, tokenizer and runner) + # PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/test_llava.sh Release test-qnn-model: name: test-qnn-model @@ -278,7 +294,7 @@ jobs: strategy: matrix: dtype: [fp32] - model: [dl3] + model: [dl3, mv3, mv2, ic4, ic3, vit] fail-fast: false with: runner: linux.2xlarge @@ -293,4 +309,128 @@ jobs: PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh cmake PYTHON_EXECUTABLE=python bash .ci/scripts/setup-qnn-deps.sh PYTHON_EXECUTABLE=python bash .ci/scripts/build-qnn-sdk.sh - PYTHON_EXECUTABLE=python bash .ci/scripts/test.sh ${{ matrix.model }} "cmake" "qnn" + PYTHON_EXECUTABLE=python bash .ci/scripts/test_model.sh ${{ matrix.model }} "cmake" "qnn" + + test-coreml-model: + name: test-coreml-model + uses: pytorch/test-infra/.github/workflows/macos_job.yml@main + strategy: + fail-fast: false + with: + runner: macos-m1-stable + python-version: '3.11' + submodules: 'true' + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + timeout: 90 + script: | + BUILD_TOOL=cmake + BACKEND=coreml + + bash .ci/scripts/setup-conda.sh + + # Setup MacOS dependencies as there is no Docker support on MacOS atm + PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/setup-macos.sh "${BUILD_TOOL}" + PYTHON_EXECUTABLE=python ${CONDA_RUN} bash backends/apple/coreml/scripts/install_requirements.sh + echo "Finishing installing coreml." 
+ + # Build and test coreml model + MODELS=(mv3 ic4 resnet50 edsr mobilebert w2l) + for MODEL_NAME in "${MODELS[@]}"; do + echo "::group::Exporting coreml model: $MODEL_NAME" + PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/test_model.sh "${MODEL_NAME}" "${BUILD_TOOL}" "${BACKEND}" + echo "::endgroup::" + done + + test-huggingface-transformers: + name: test-huggingface-transformers + uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + secrets: inherit + strategy: + matrix: + hf_model_repo: [google/gemma-2b] + fail-fast: false + with: + secrets-env: EXECUTORCH_HF_TOKEN + runner: linux.12xlarge + docker-image: executorch-ubuntu-22.04-clang12 + submodules: 'true' + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + timeout: 90 + script: | + echo "::group::Set up ExecuTorch" + # The generic Linux job chooses to use base env, not the one setup by the image + CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]") + conda activate "${CONDA_ENV}" + PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh cmake + + echo "Installing libexecutorch.a, libextension_module.so, libportable_ops_lib.a" + rm -rf cmake-out + cmake \ + -DCMAKE_INSTALL_PREFIX=cmake-out \ + -DCMAKE_BUILD_TYPE=Release \ + -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ + -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ + -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \ + -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ + -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ + -DEXECUTORCH_BUILD_XNNPACK=ON \ + -DPYTHON_EXECUTABLE=python \ + -Bcmake-out . + cmake --build cmake-out -j9 --target install --config Release + + echo "Build llama runner" + dir="examples/models/llama2" + cmake \ + -DCMAKE_INSTALL_PREFIX=cmake-out \ + -DCMAKE_BUILD_TYPE=Release \ + -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \ + -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ + -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ + -DEXECUTORCH_BUILD_XNNPACK=ON \ + -DPYTHON_EXECUTABLE=python \ + -Bcmake-out/${dir} \ + ${dir} + cmake --build cmake-out/${dir} -j9 --config Release + echo "::endgroup::" + + echo "::group::Set up HuggingFace Dependencies" + if [ -z "$SECRET_EXECUTORCH_HF_TOKEN" ]; then + echo "::error::SECRET_EXECUTORCH_HF_TOKEN is empty. For security reason secrets won't be accessible on forked PRs. Please make sure you submit a non-forked PR." 
+ exit 1 + fi + pip install -U "huggingface_hub[cli]" + huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN + pip install accelerate sentencepiece + # TODO(guangyang): Switch to use released transformers library after all required patches are included + pip install "git+https://github.com/huggingface/transformers.git@6cc4dfe3f1e8d421c6d6351388e06e9b123cbfe1" + pip list + echo "::endgroup::" + + echo "::group::Export to ExecuTorch" + TOKENIZER_FILE=tokenizer.model + TOKENIZER_BIN_FILE=tokenizer.bin + ET_MODEL_NAME=et_model + # Fetch the file using a Python one-liner + DOWNLOADED_TOKENIZER_FILE_PATH=$(python -c " + from huggingface_hub import hf_hub_download + # Download the file from the Hugging Face Hub + downloaded_path = hf_hub_download( + repo_id='${{ matrix.hf_model_repo }}', + filename='${TOKENIZER_FILE}' + ) + print(downloaded_path) + ") + if [ -f "$DOWNLOADED_TOKENIZER_FILE_PATH" ]; then + echo "${TOKENIZER_FILE} downloaded successfully at: $DOWNLOADED_TOKENIZER_FILE_PATH" + python -m extension.llm.tokenizer.tokenizer -t $DOWNLOADED_TOKENIZER_FILE_PATH -o ./${TOKENIZER_BIN_FILE} + ls ./tokenizer.bin + else + echo "Failed to download ${TOKENIZER_FILE} from ${{ matrix.hf_model_repo }}." + exit 1 + fi + + python -m extension.export_util.export_hf_model -hfm=${{ matrix.hf_model_repo }} -o ${ET_MODEL_NAME} + + cmake-out/examples/models/llama2/llama_main --model_path=${ET_MODEL_NAME}.pte --tokenizer_path=${TOKENIZER_BIN_FILE} --prompt="My name is" + echo "::endgroup::" diff --git a/.github/workflows/upload-test-specs.yml b/.github/workflows/upload-android-test-specs.yml similarity index 70% rename from .github/workflows/upload-test-specs.yml rename to .github/workflows/upload-android-test-specs.yml index 24119b64566..e9b1054080c 100644 --- a/.github/workflows/upload-test-specs.yml +++ b/.github/workflows/upload-android-test-specs.yml @@ -1,19 +1,21 @@ -name: Upload AWS Device Farm test specs +name: Upload AWS Device Farm Android test specs on: pull_request: paths: - - .github/workflows/upload-test-specs.yml - - examples/demo-apps/android/LlamaDemo/android-llm-device-farm-test-spec.yml + - .github/workflows/upload-android-test-specs.yml + - extension/android/benchmark/android-llm-device-farm-test-spec.yml push: branches: - main paths: - - .github/workflows/upload-test-specs.yml - - examples/demo-apps/android/LlamaDemo/android-llm-device-farm-test-spec.yml + - .github/workflows/upload-android-test-specs.yml + - extension/android/benchmark/android-llm-device-farm-test-spec.yml concurrency: - group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }} + # NB: This concurency group needs to be different than the one used in android-perf, otherwise + # GH complains about concurrency deadlock + group: android-spec-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }} cancel-in-progress: true jobs: @@ -27,10 +29,10 @@ jobs: with: s3-bucket: gha-artifacts s3-prefix: | - ${{ github.repository }}/${{ github.run_id }}/artifact + ${{ github.repository }}/${{ github.run_id }}/artifacts retention-days: 1 if-no-files-found: error - path: examples/demo-apps/android/LlamaDemo/android-llm-device-farm-test-spec.yml + path: extension/android/benchmark/android-llm-device-farm-test-spec.yml 
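Note: the `s3-prefix` used by the upload step above and the `test_spec` URL consumed by the validation job below have to agree on the `artifacts` path segment of the gha-artifacts bucket. A small Python sketch of how that URL is composed (repository and run-id values are illustrative):

def gha_artifact_url(repository: str, run_id: str, filename: str) -> str:
    # Mirrors the gha-artifacts bucket layout: <repo>/<run_id>/artifacts/<file>
    return (
        "https://gha-artifacts.s3.amazonaws.com/"
        f"{repository}/{run_id}/artifacts/{filename}"
    )

# Illustrative values only:
print(gha_artifact_url("pytorch/executorch", "1234567890",
                       "android-llm-device-farm-test-spec.yml"))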
validate-android-test-spec: needs: upload-android-test-spec-for-validation @@ -41,9 +43,9 @@ jobs: with: # Just use a small model here with a minimal amount of configuration to test the spec models: stories110M - devices: samsung_galaxy_s2x + devices: samsung_galaxy_s22 delegates: xnnpack - test_spec: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifact/android-llm-device-farm-test-spec.yml + test_spec: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifacts/android-llm-device-farm-test-spec.yml upload-android-test-spec: needs: validate-android-test-spec @@ -75,7 +77,7 @@ jobs: - name: Upload the spec to S3 ossci-android bucket shell: bash - working-directory: examples/demo-apps/android/LlamaDemo/ + working-directory: extension/android/benchmark/ env: SPEC_FILE: android-llm-device-farm-test-spec.yml run: | diff --git a/.github/workflows/upload-apple-test-specs.yml b/.github/workflows/upload-apple-test-specs.yml new file mode 100644 index 00000000000..06d20ef2beb --- /dev/null +++ b/.github/workflows/upload-apple-test-specs.yml @@ -0,0 +1,95 @@ +name: Upload AWS Device Farm Apple iOS test specs + +on: + pull_request: + paths: + - .github/workflows/upload-apple-test-specs.yml + - examples/demo-apps/apple_ios/default-ios-device-farm-appium-test-spec.yml + push: + branches: + - main + paths: + - .github/workflows/upload-apple-test-specs.yml + - examples/demo-apps/apple_ios/default-ios-device-farm-appium-test-spec.yml + +concurrency: + # NB: This concurency group needs to be different than the one used in apple-perf, otherwise + # GH complains about concurrency deadlock + group: apple-spec-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }} + cancel-in-progress: true + +jobs: + upload-apple-test-spec-for-validation: + runs-on: linux.2xlarge + steps: + - uses: actions/checkout@v3 + + - name: Upload the spec as a GitHub artifact for validation + uses: seemethere/upload-artifact-s3@v5 + with: + s3-bucket: gha-artifacts + s3-prefix: | + ${{ github.repository }}/${{ github.run_id }}/artifacts + retention-days: 1 + if-no-files-found: error + path: examples/demo-apps/apple_ios/default-ios-device-farm-appium-test-spec.yml + + validate-apple-test-spec: + needs: upload-apple-test-spec-for-validation + uses: ./.github/workflows/apple-perf.yml + secrets: inherit + permissions: + id-token: write + contents: read + with: + # Just use a small model here with a minimal amount of configuration to test the spec + models: stories110M + devices: apple_iphone_15 + delegates: xnnpack + test_spec: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifacts/default-ios-device-farm-appium-test-spec.yml + + upload-apple-test-spec: + needs: validate-apple-test-spec + runs-on: ubuntu-22.04 + timeout-minutes: 15 + permissions: + id-token: write + contents: read + steps: + - uses: actions/checkout@v3 + + - uses: actions/setup-python@v4 + with: + python-version: '3.11' + cache: pip + + - name: configure aws credentials + uses: aws-actions/configure-aws-credentials@v1.7.0 + with: + role-to-assume: arn:aws:iam::308535385114:role/gha_executorch_upload-frameworks-ios + aws-region: us-east-1 + + - name: Only push to S3 when running the workflow manually from main branch + if: ${{ github.ref == 'refs/heads/main' }} + shell: bash + run: | + set -eux + echo "UPLOAD_ON_MAIN=1" >> 
"${GITHUB_ENV}" + + - name: Upload the spec to S3 ossci-ios bucket + shell: bash + working-directory: examples/demo-apps/apple_ios + env: + SPEC_FILE: default-ios-device-farm-appium-test-spec.yml + run: | + set -eux + + pip install awscli==1.32.18 + + AWS_CMD="aws s3 cp --dryrun" + if [[ "${UPLOAD_ON_MAIN:-0}" == "1" ]]; then + AWS_CMD="aws s3 cp" + fi + + shasum -a 256 "${SPEC_FILE}" + ${AWS_CMD} "${SPEC_FILE}" s3://ossci-ios/executorch/ --acl public-read diff --git a/.gitmodules b/.gitmodules index 0999bdb9356..71ff854bb03 100644 --- a/.gitmodules +++ b/.gitmodules @@ -21,7 +21,7 @@ url = https://github.com/Maratyszcza/FXdiv.git [submodule "backends/xnnpack/third-party/XNNPACK"] path = backends/xnnpack/third-party/XNNPACK - url = https://github.com/digantdesai/XNNPACK.git + url = https://github.com/google/XNNPACK.git [submodule "backends/xnnpack/third-party/cpuinfo"] path = backends/xnnpack/third-party/cpuinfo url = https://github.com/pytorch/cpuinfo.git diff --git a/.lintrunner.toml b/.lintrunner.toml index c28512c5986..7aa15d65638 100644 --- a/.lintrunner.toml +++ b/.lintrunner.toml @@ -74,6 +74,9 @@ exclude_patterns = [ # NB: Objective-C is not supported 'examples/apple/**', 'examples/demo-apps/apple_ios/**', + # File contains @generated + 'extension/llm/custom_ops/spinquant/fast_hadamard_transform_special.h', + 'extension/llm/custom_ops/spinquant/test/fast_hadamard_transform_special_unstrided_cpu.h', ] command = [ 'python', @@ -177,6 +180,9 @@ exclude_patterns = [ '**/*.bat', '**/*.jpg', '**/*.jar', + # File contains @generated + 'extension/llm/custom_ops/spinquant/fast_hadamard_transform_special.h', + 'extension/llm/custom_ops/spinquant/test/fast_hadamard_transform_special_unstrided_cpu.h', ] command = [ 'python', diff --git a/CMakeLists.txt b/CMakeLists.txt index afb0437fae4..288bc9018ad 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -181,6 +181,10 @@ option(EXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL "Build the Runner Util extension" OFF ) +option(EXECUTORCH_BUILD_EXTENSION_TENSOR "Build the Tensor extension" OFF) + +option(EXECUTORCH_BUILD_EXTENSION_TRAINING "Build the training extension" OFF) + option(EXECUTORCH_BUILD_GTESTS "Build googletest based test binaries" OFF) option(EXECUTORCH_BUILD_MPS "Build the MPS backend" OFF) @@ -195,7 +199,7 @@ option(EXECUTORCH_BUILD_KERNELS_OPTIMIZED "Build the optimized kernels" OFF) option(EXECUTORCH_BUILD_KERNELS_QUANTIZED "Build the quantized kernels" OFF) -option(EXECUTORCH_BUILD_SDK "Build the ExecuTorch SDK") +option(EXECUTORCH_BUILD_DEVTOOLS "Build the ExecuTorch Developer Tools") option(EXECUTORCH_BUILD_SIZE_TEST "Build the size test" OFF) @@ -226,6 +230,7 @@ cmake_dependent_option( ) if(EXECUTORCH_BUILD_KERNELS_CUSTOM_AOT) + set(EXECUTORCH_BUILD_EXTENSION_TENSOR ON) set(EXECUTORCH_BUILD_KERNELS_CUSTOM ON) endif() @@ -505,7 +510,8 @@ if(EXECUTORCH_BUILD_PYBIND AND APPLE) ) target_link_libraries(executorch_no_prim_ops_shared PRIVATE program_schema) if(DL_LIBRARY_EXISTS) - target_link_libraries(executorch_no_prim_ops_shared PRIVATE dl) # For dladdr() + # For dladdr() + target_link_libraries(executorch_no_prim_ops_shared PRIVATE dl) endif() target_include_directories( executorch_no_prim_ops_shared PUBLIC ${_common_include_directories} @@ -541,17 +547,13 @@ target_link_options_shared_lib(executorch) # operators necessary for the models that will run. 
# if(BUILD_EXECUTORCH_PORTABLE_OPS) -add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/kernels/portable) + add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/kernels/portable) endif() if(EXECUTORCH_BUILD_KERNELS_OPTIMIZED) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/kernels/optimized) endif() -if(EXECUTORCH_BUILD_KERNELS_QUANTIZED) - add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/kernels/quantized) -endif() - add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/configurations) # @@ -582,90 +584,77 @@ cmake_dependent_option( EXECUTORCH_BUILD_EXECUTOR_RUNNER "Build the executor_runner executable" ON EXECUTORCH_BUILD_HOST_TARGETS OFF ) -if(EXECUTORCH_BUILD_EXECUTOR_RUNNER) - # Baseline libraries that executor_runner will link against. - set(_executor_runner_libs executorch gflags) - - if(EXECUTORCH_BUILD_KERNELS_OPTIMIZED) - list(APPEND _executor_runner_libs optimized_native_cpu_ops_lib) - elseif(EXECUTORCH_BUILD_CADENCE) - list(APPEND _executor_runner_libs cadence_ops_lib) - else() - list(APPEND _executor_runner_libs portable_ops_lib) - endif() - - # Generate lib to register quantized ops - if(EXECUTORCH_BUILD_KERNELS_QUANTIZED) - list(APPEND _executor_runner_libs quantized_ops_lib) - endif() - - add_executable(executor_runner ${_executor_runner__srcs}) - if(CMAKE_BUILD_TYPE STREQUAL "Release" AND NOT APPLE) - target_link_options(executor_runner PRIVATE "LINKER:--gc-sections") - endif() - target_link_libraries(executor_runner ${_executor_runner_libs}) - target_compile_options(executor_runner PUBLIC ${_common_compile_options}) -endif() # Add googletest if any test targets should be built if(EXECUTORCH_BUILD_GTESTS) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/third-party/googletest) endif() -if(EXECUTORCH_BUILD_SDK) - set(EXECUTORCH_BUILD_EXTENSION_DATA_LOADER - ON - CACHE BOOL "EXECUTORCH_BUILD_EXTENSION_DATA_LOADER" FORCE - ) - add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/sdk) +if(EXECUTORCH_BUILD_ARM_BAREMETAL) + add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/arm) endif() -if(EXECUTORCH_BUILD_EXTENSION_APPLE) - add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/apple) +if(EXECUTORCH_BUILD_CADENCE) + add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/cadence) endif() -if(EXECUTORCH_BUILD_EXTENSION_DATA_LOADER) - add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/data_loader) +if(EXECUTORCH_BUILD_COREML) + add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/apple/coreml) endif() -if(EXECUTORCH_BUILD_EXTENSION_MODULE) - add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/module) +if(EXECUTORCH_BUILD_MPS) + add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/apple/mps) endif() if(EXECUTORCH_BUILD_NEURON) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/mediatek) endif() -if(EXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL) - add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/runner_util) +if(EXECUTORCH_BUILD_QNN) + add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/qualcomm) endif() if(EXECUTORCH_BUILD_XNNPACK) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/xnnpack) endif() -if(EXECUTORCH_BUILD_VULKAN) - add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/vulkan) +if(EXECUTORCH_BUILD_DEVTOOLS) + set(EXECUTORCH_BUILD_EXTENSION_DATA_LOADER + ON + CACHE BOOL "EXECUTORCH_BUILD_EXTENSION_DATA_LOADER" FORCE + ) + add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/devtools) endif() -if(EXECUTORCH_BUILD_QNN) - add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/qualcomm) +if(EXECUTORCH_BUILD_EXTENSION_APPLE) + add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/apple) endif() 
-if(EXECUTORCH_BUILD_ARM_BAREMETAL) - add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/arm) +if(EXECUTORCH_BUILD_EXTENSION_DATA_LOADER) + add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/data_loader) endif() -if(EXECUTORCH_BUILD_MPS) - add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/apple/mps) +if(EXECUTORCH_BUILD_EXTENSION_MODULE) + add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/module) endif() -if(EXECUTORCH_BUILD_COREML) - add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/apple/coreml) +if(EXECUTORCH_BUILD_EXTENSION_TRAINING) + add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/training) endif() -if(EXECUTORCH_BUILD_CADENCE) - add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/cadence) +if(EXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL) + add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/runner_util) +endif() + +if(EXECUTORCH_BUILD_EXTENSION_TENSOR) + add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/tensor) +endif() + +if(EXECUTORCH_BUILD_PTHREADPOOL + AND EXECUTORCH_BUILD_CPUINFO + AND CMAKE_CXX_STANDARD GREATER_EQUAL 14 +) + add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/threadpool) endif() if(EXECUTORCH_BUILD_PYBIND) @@ -675,8 +664,8 @@ if(EXECUTORCH_BUILD_PYBIND) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/data_loader) endif() - if(NOT EXECUTORCH_BUILD_SDK) - add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/sdk) + if(NOT EXECUTORCH_BUILD_DEVTOOLS) + add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/devtools) endif() # find pytorch lib, to allow pybind to take at::Tensor as input/output @@ -691,11 +680,16 @@ if(EXECUTORCH_BUILD_PYBIND) etdump executorch extension_data_loader - portable_ops_lib util torch ) + if(EXECUTORCH_BUILD_KERNELS_OPTIMIZED) + list(APPEND _dep_libs optimized_native_cpu_ops_lib) + else() + list(APPEND _dep_libs portable_ops_lib) + endif() + if(EXECUTORCH_BUILD_COREML) list(APPEND _dep_libs coremldelegate) endif() @@ -710,10 +704,6 @@ if(EXECUTORCH_BUILD_PYBIND) list(APPEND _dep_libs xnnpack_backend XNNPACK) endif() - if(EXECUTORCH_BUILD_KERNELS_QUANTIZED) - target_link_options_shared_lib(quantized_ops_lib) - endif() - # compile options for pybind set(_pybind_compile_options -Wno-deprecated-declarations @@ -726,10 +716,8 @@ if(EXECUTORCH_BUILD_PYBIND) ) # util lib add_library( - util - ${CMAKE_CURRENT_SOURCE_DIR}/extension/evalue_util/print_evalue.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/extension/aten_util/aten_bridge.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/util/read_file.cpp + util ${CMAKE_CURRENT_SOURCE_DIR}/extension/evalue_util/print_evalue.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/extension/aten_util/aten_bridge.cpp ) target_include_directories( util PUBLIC ${_common_include_directories} ${TORCH_INCLUDE_DIRS} @@ -778,12 +766,14 @@ if(EXECUTORCH_BUILD_PYBIND) else() set_target_properties( portable_lib - PROPERTIES # Assume is the root `site-packages/executorch` - # Need to add /extension/llm/custom_ops for - # libcustom_ops_aot_lib - # Need to add /kernels/quantized for - # libquantized_ops_aot_lib - BUILD_RPATH "$ORIGIN:$ORIGIN/../../extension/llm/custom_ops:$ORIGIN/../../kernels/quantized" + PROPERTIES + # Assume is the root `site-packages/executorch` + # Need to add /extension/llm/custom_ops for + # libcustom_ops_aot_lib + # Need to add /kernels/quantized for + # libquantized_ops_aot_lib + BUILD_RPATH + "$ORIGIN:$ORIGIN/../../extension/llm/custom_ops:$ORIGIN/../../kernels/quantized" ) endif() @@ -794,9 +784,45 @@ endif() if(EXECUTORCH_BUILD_KERNELS_CUSTOM) # TODO: move all custom kernels to 
${CMAKE_CURRENT_SOURCE_DIR}/kernels/custom - add_subdirectory( - ${CMAKE_CURRENT_SOURCE_DIR}/extension/llm/custom_ops - ) + add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/llm/custom_ops) +endif() + +if(EXECUTORCH_BUILD_KERNELS_QUANTIZED) + add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/kernels/quantized) + target_link_options_shared_lib(quantized_ops_lib) +endif() + +if(EXECUTORCH_BUILD_EXECUTOR_RUNNER) + # Baseline libraries that executor_runner will link against. + set(_executor_runner_libs executorch gflags) + + if(EXECUTORCH_BUILD_KERNELS_OPTIMIZED) + list(APPEND _executor_runner_libs optimized_native_cpu_ops_lib) + elseif(EXECUTORCH_BUILD_CADENCE) + list(APPEND _executor_runner_libs cadence_ops_lib) + else() + list(APPEND _executor_runner_libs portable_ops_lib) + endif() + + # Generate lib to register quantized ops + if(EXECUTORCH_BUILD_KERNELS_QUANTIZED) + list(APPEND _executor_runner_libs quantized_ops_lib) + endif() + + add_executable(executor_runner ${_executor_runner__srcs}) + if(CMAKE_BUILD_TYPE STREQUAL "Release") + if(APPLE) + target_link_options(executor_runner PRIVATE "LINKER:-dead_strip") + else() + target_link_options(executor_runner PRIVATE "LINKER:--gc-sections") + endif() + endif() + target_link_libraries(executor_runner ${_executor_runner_libs}) + target_compile_options(executor_runner PUBLIC ${_common_compile_options}) +endif() + +if(EXECUTORCH_BUILD_VULKAN) + add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/vulkan) endif() # Print all summary diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 2ad23f84d17..d434c1fe198 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -131,9 +131,7 @@ for detailed advice. #### C++ language version -**C++11.** - -NOTE: The code does not yet fully conform to this, and some files require C++17. +**C++17.** Rationale: This is a compromise between being compatible with older, proprietary toolchains, and having access to relatively modern C++ features. diff --git a/README.md b/README.md index c4e6e0caf75..0e78f4da356 100644 --- a/README.md +++ b/README.md @@ -10,9 +10,9 @@ Key value propositions of ExecuTorch are: - **Portability:** Compatibility with a wide variety of computing platforms, from high-end mobile phones to highly constrained embedded systems and microcontrollers. -- **Productivity:** Enabling developers to use the same toolchains and SDK from - PyTorch model authoring and conversion, to debugging and deployment to a wide - variety of platforms. +- **Productivity:** Enabling developers to use the same toolchains and Developer + Tools from PyTorch model authoring and conversion, to debugging and deployment + to a wide variety of platforms. - **Performance:** Providing end users with a seamless and high-performance experience due to a lightweight runtime and utilizing full hardware capabilities such as CPUs, NPUs, and DSPs. @@ -22,6 +22,8 @@ please visit our documentation website [for the latest release](https://pytorch. Check out the [Getting Started](https://pytorch.org/executorch/stable/getting-started-setup.html#quick-setup-colab-jupyter-notebook-prototype) page for a quick spin. +Check out the examples of [Llama](./examples/models/llama2/README.md), [Llava](./examples/models/llava/README.md) and [other models](./examples/README.md) running on edge devices using ExecuTorch. + ## Feedback We welcome any feedback, suggestions, and bug reports from the community to help @@ -93,7 +95,7 @@ tools. ├── schema # ExecuTorch PTE file format flatbuffer schemas. 
├── scripts # Utility scripts for size management, dependency management, etc. -├── sdk # Model profiling, debugging, and introspection. +├── devtools # Model profiling, debugging, and introspection. ├── shim # Compatibility layer between OSS and Internal builds ├── test # Broad scoped end-to-end tests. ├── third-party # Third-party dependencies. diff --git a/backends/apple/coreml/CMakeLists.txt b/backends/apple/coreml/CMakeLists.txt index 113b21bd690..27e09b3f581 100644 --- a/backends/apple/coreml/CMakeLists.txt +++ b/backends/apple/coreml/CMakeLists.txt @@ -13,11 +13,11 @@ if(NOT EXECUTORCH_ROOT) set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../..) endif() -if(EXECUTORCH_BUILD_SDK) -# protobuf requires frtti -set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -frtti" ) +if(EXECUTORCH_BUILD_DEVTOOLS) + # protobuf requires frtti + set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -frtti") endif() - + option(COREML_BUILD_EXECUTOR_RUNNER "Build CoreML executor runner." OFF) # inmemoryfs sources @@ -136,7 +136,7 @@ target_include_directories( target_include_directories(coremldelegate PRIVATE ${EXECUTORCH_ROOT}/..) target_link_libraries(coremldelegate PRIVATE executorch_no_prim_ops) -if(EXECUTORCH_BUILD_SDK) +if(EXECUTORCH_BUILD_DEVTOOLS) target_sources(coremldelegate PRIVATE ${SDK_SOURCES} ${PROTOBUF_SOURCES}) target_include_directories( coremldelegate @@ -174,7 +174,7 @@ endif() target_compile_options(coremldelegate PRIVATE "-fobjc-arc") target_compile_options(coremldelegate PRIVATE "-fno-exceptions") -if(EXECUTORCH_BUILD_SDK) +if(EXECUTORCH_BUILD_DEVTOOLS) target_compile_options( executorch_no_prim_ops PUBLIC -DET_EVENT_TRACER_ENABLED ) diff --git a/backends/apple/coreml/compiler/coreml_preprocess.py b/backends/apple/coreml/compiler/coreml_preprocess.py index 375fdf406b2..5084405c468 100644 --- a/backends/apple/coreml/compiler/coreml_preprocess.py +++ b/backends/apple/coreml/compiler/coreml_preprocess.py @@ -3,6 +3,7 @@ # CoreML backend for delegating a EdgeProgram to CoreML. 
import json +import logging import shutil import uuid @@ -14,6 +15,7 @@ from typing import Any, Dict, final, List, Optional, Tuple import coremltools as ct +import coremltools.optimize as cto import executorchcoreml from executorch.exir.backend.backend_details import ( @@ -23,12 +25,16 @@ ) from executorch.exir.backend.compile_spec_schema import CompileSpec +logger = logging.getLogger(__name__) +logger.setLevel(logging.WARNING) + class COMPILE_SPEC_KEYS(Enum): COMPUTE_UNITS = "compute_units" MODEL_TYPE = "model_type" MIN_DEPLOYMENT_TARGET = "min_deployment_target" MODEL_COMPUTE_PRECISION = "model_compute_precision" + OP_LINEAR_QUANTIZER_CONFIG = "op_linear_quantizer_config" class MODEL_PATHS(Enum): @@ -169,12 +175,44 @@ def generate_compute_unit_compile_spec( compute_unit.name.lower().encode("utf-8"), ) + @staticmethod + def generate_op_linear_quantizer_config_compile_spec( + op_linear_quantizer_config: Dict, + ) -> CompileSpec: + """ + Returns the compile spec representing the model post conversion quantization, + which is a dict that will construct cto.coreml.OpLinearQuantizerConfig + """ + str_representation = json.dumps(op_linear_quantizer_config) + byte_representation = str_representation.encode("utf-8") + return CompileSpec( + COMPILE_SPEC_KEYS.OP_LINEAR_QUANTIZER_CONFIG.value, + byte_representation, + ) + + @staticmethod + def op_linear_quantizer_config_from_compile_specs( + compile_specs: List[CompileSpec], + ) -> cto.coreml.OpLinearQuantizerConfig: + """ + Returns the model's post conversion quantization by parsing the list of compile specs. + """ + for compile_spec in compile_specs: + if compile_spec.key == COMPILE_SPEC_KEYS.OP_LINEAR_QUANTIZER_CONFIG.value: + config_dict_str = compile_spec.value.decode("utf-8") + config_dict = json.loads(config_dict_str) + config = cto.coreml.OpLinearQuantizerConfig._from_dict(config_dict) + return config + + return None + @staticmethod def generate_compile_specs( compute_unit: ct.ComputeUnit = ct.ComputeUnit.ALL, minimum_deployment_target: ct.target = ct.target.iOS15, compute_precision: ct.precision = ct.precision.FLOAT16, model_type: MODEL_TYPE = MODEL_TYPE.MODEL, + op_linear_quantizer_config: Optional[Dict] = None, ) -> List[CompileSpec]: """ Returns the list of compile specs that's used by CoreMLBackend to lower the module. 
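For context, a usage sketch of the `op_linear_quantizer_config` compile spec introduced above. The config values are illustrative, not defaults; the dict is JSON-serialized into the compile spec and rebuilt as a `cto.coreml.OpLinearQuantizerConfig` inside `preprocess`, where it is applied with `cto.coreml.linear_quantize_weights` after conversion.

import coremltools as ct

from executorch.backends.apple.coreml.compiler import CoreMLBackend
from executorch.backends.apple.coreml.partition import CoreMLPartitioner

compile_specs = CoreMLBackend.generate_compile_specs(
    compute_unit=ct.ComputeUnit.ALL,
    minimum_deployment_target=ct.target.iOS17,
    compute_precision=ct.precision.FLOAT16,
    # Post-conversion weight quantization; keys mirror the
    # cto.coreml.OpLinearQuantizerConfig constructor (illustrative values).
    op_linear_quantizer_config={
        "mode": "linear_symmetric",
        "dtype": "int8",
        "granularity": "per_channel",
    },
)
partitioner = CoreMLPartitioner(compile_specs=compile_specs)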
@@ -192,6 +230,12 @@ def generate_compile_specs( CoreMLBackend.generate_compute_precision_compile_spec(compute_precision) ) compile_specs.append(CoreMLBackend.generate_model_type_compile_spec(model_type)) + if op_linear_quantizer_config is not None: + compile_specs.append( + CoreMLBackend.generate_op_linear_quantizer_config_compile_spec( + op_linear_quantizer_config + ) + ) return compile_specs @@ -368,18 +412,18 @@ def preprocess( compile_specs, ) ) - model_compute_precision: ct.precision = ( CoreMLBackend.model_compute_precision_from_compile_specs(compile_specs) ) - minimum_deployment_target: ct.target = ( CoreMLBackend.min_deployment_target_from_compile_specs(compile_specs) ) - compute_units: ct.ComputeUnit = CoreMLBackend.compute_unit_from_compile_specs( compile_specs ) + op_linear_quantizer_config = ( + CoreMLBackend.op_linear_quantizer_config_from_compile_specs(compile_specs) + ) mlmodel = ct.convert( model=edge_program, @@ -392,4 +436,15 @@ def preprocess( compute_units=compute_units, ) + if op_linear_quantizer_config is not None: + logger.warning( + "Core ML Backend op_linear_quantizer_config API is experimental" + ) + config = cto.coreml.OptimizationConfig( + global_config=op_linear_quantizer_config, + # skip embedding + op_type_configs={"gather": None}, + ) + mlmodel = cto.coreml.linear_quantize_weights(mlmodel, config=config) + return CoreMLBackend.preprocess_model(mlmodel, model_type=model_type) diff --git a/backends/apple/coreml/partition/coreml_partitioner.py b/backends/apple/coreml/partition/coreml_partitioner.py index ecf6d44b19c..c0b6663f729 100644 --- a/backends/apple/coreml/partition/coreml_partitioner.py +++ b/backends/apple/coreml/partition/coreml_partitioner.py @@ -17,7 +17,7 @@ Partitioner, PartitionResult, ) -from executorch.exir.backend.utils import tag_constant_data +from executorch.exir.backend.utils import tag_constant_data, tag_mutated_buffer from torch.export.exported_program import ExportedProgram from torch.fx.passes.infra.partitioner import CapabilityBasedPartitioner from torch.fx.passes.operator_support import OperatorSupportBase @@ -61,6 +61,7 @@ def __init__( self, skip_ops_for_coreml_delegation: Optional[List[str]] = None, compile_specs: Optional[List[CompileSpec]] = None, + take_over_mutable_buffer: Optional[bool] = True, ) -> None: if skip_ops_for_coreml_delegation is None: skip_ops_for_coreml_delegation = [] @@ -69,6 +70,7 @@ def __init__( backend_id=CoreMLBackend.__name__, compile_specs=compile_specs if compile_specs is not None else [], ) + self.take_over_mutable_buffer = take_over_mutable_buffer def partition(self, exported_program: ExportedProgram) -> PartitionResult: # Run the CapabilityBasedPartitioner to return the largest possible @@ -89,6 +91,15 @@ def partition(self, exported_program: ExportedProgram) -> PartitionResult: partition_tags[tag] = self.delegation_spec tag_constant_data(exported_program) + if self.take_over_mutable_buffer: + logger.info( + "Core ML partitioner will take over torch mutable buffer as Core ML state, " + "so if your model contains mutable buffer, " + "then you will need MacOS15+/iOS18+ to execute. 
" + "If you want your mutable buffer model to be compatible with older OS, " + "then please set `take_over_mutable_buffer=False`" + ) + tag_mutated_buffer(exported_program) return PartitionResult( tagged_exported_program=exported_program, partition_tags=partition_tags diff --git a/backends/apple/coreml/runtime/delegate/ETCoreMLDefaultModelExecutor.mm b/backends/apple/coreml/runtime/delegate/ETCoreMLDefaultModelExecutor.mm index 57316e28015..226307f3c8f 100644 --- a/backends/apple/coreml/runtime/delegate/ETCoreMLDefaultModelExecutor.mm +++ b/backends/apple/coreml/runtime/delegate/ETCoreMLDefaultModelExecutor.mm @@ -29,9 +29,10 @@ - (instancetype)initWithModel:(ETCoreMLModel *)model { if (self.ignoreOutputBackings) { predictionOptions.outputBackings = @{}; } - id outputs = [self.model.mlModel predictionFromFeatures:inputs - options:predictionOptions - error:error]; + + id outputs = [self.model predictionFromFeatures:inputs + options:predictionOptions + error:error]; if (!outputs) { return nil; } diff --git a/backends/apple/coreml/runtime/delegate/ETCoreMLModel.h b/backends/apple/coreml/runtime/delegate/ETCoreMLModel.h index 9bf3183e65a..58026593462 100644 --- a/backends/apple/coreml/runtime/delegate/ETCoreMLModel.h +++ b/backends/apple/coreml/runtime/delegate/ETCoreMLModel.h @@ -37,15 +37,12 @@ __attribute__((objc_subclassing_restricted)) orderedOutputNames:(NSOrderedSet*)orderedOutputNames error:(NSError* __autoreleasing*)error NS_DESIGNATED_INITIALIZER; -- (nullable NSArray*)prepareInputs:(const std::vector&)inputs - error:(NSError* __autoreleasing*)error; - -- (nullable NSArray*)prepareOutputBackings:(const std::vector&)outputs - error:(NSError* __autoreleasing*)error; - /// The underlying MLModel. @property (strong, readonly, nonatomic) MLModel* mlModel; +/// The model state. +@property (strong, readonly, nonatomic, nullable) id state; + /// The asset from which the model is loaded. @property (strong, readonly, nonatomic) ETCoreMLAsset* asset; @@ -58,6 +55,19 @@ __attribute__((objc_subclassing_restricted)) /// The ordered output names of the model. 
@property (copy, readonly, nonatomic) NSOrderedSet* orderedOutputNames; + +- (nullable id)predictionFromFeatures:(id)input + options:(MLPredictionOptions*)options + error:(NSError* __autoreleasing*)error; + +- (nullable NSArray*)prepareInputs:(const std::vector&)inputs + error:(NSError* __autoreleasing*)error; + +- (nullable NSArray*)prepareOutputBackings:(const std::vector&)outputs + error:(NSError* __autoreleasing*)error; + +- (BOOL)prewarmAndReturnError:(NSError* __autoreleasing*)error; + @end NS_ASSUME_NONNULL_END diff --git a/backends/apple/coreml/runtime/delegate/ETCoreMLModel.mm b/backends/apple/coreml/runtime/delegate/ETCoreMLModel.mm index ee7218bd271..6b39ae5f920 100644 --- a/backends/apple/coreml/runtime/delegate/ETCoreMLModel.mm +++ b/backends/apple/coreml/runtime/delegate/ETCoreMLModel.mm @@ -7,10 +7,12 @@ #import -#import +#import "ETCoreMLAsset.h" +#import "ETCoreMLLogging.h" +#import "multiarray.h" +#import "objc_array_util.h" +#import "MLModel_Prewarm.h" #import -#import -#import #import #pragma mark - ETCoreMLMultiArrayDescriptor @@ -155,6 +157,19 @@ size_t get_number_of_bytes(MLMultiArrayDataType data_type) { return get_multi_array_constraints_by_name(description.outputDescriptionsByName); } +#if MODEL_STATE_IS_SUPPORTED +API_AVAILABLE(macos(15.0), ios(18.0), tvos(18.0), watchos(11.0)) +void reset_state_for_feature_name(NSString *feature_name, MLState *state) { + [state getMultiArrayForStateNamed:feature_name handler:^(MLMultiArray *buffer) { + [buffer getMutableBytesWithHandler:^(void *mutableBytes, NSInteger size, NSArray * __unused strides) { + uint8_t *start = reinterpret_cast(mutableBytes); + uint8_t *end = start + size; + std::fill(start, end, uint8_t(0)); + }]; + }]; +} +#endif + } #pragma mark - ETCoreMLModel @@ -194,6 +209,11 @@ - (nullable instancetype)initWithAsset:(ETCoreMLAsset *)asset _cache = [[NSCache alloc] init]; _inputConstraintsByName = get_multi_array_input_constraints_by_name(mlModel.modelDescription); _outputConstraintsByName = get_multi_array_output_constraints_by_name(mlModel.modelDescription); +#if MODEL_STATE_IS_SUPPORTED + if (@available(macOS 15.0, iOS 18.0, tvOS 18.0, watchOS 11.0, *)) { + _state = mlModel.modelDescription.stateDescriptionsByName.count > 0 ? 
[_mlModel newState] : nil; + } +#endif } return self; @@ -272,4 +292,52 @@ MultiArray buffer(mutableBytes, MultiArray::MemoryLayout(to_multiarray_data_type } +- (nullable id)predictionFromFeatures:(id)input + options:(MLPredictionOptions *)options + error:(NSError **)error { +#if MODEL_STATE_IS_SUPPORTED + if (@available(macOS 15.0, iOS 18.0, tvOS 18.0, watchOS 11.0, *)) { + if (self.state != nil) { + return [self.mlModel predictionFromFeatures:input + usingState:(MLState *)self.state + options:options + error:error]; + } + } +#endif + + id result = [self.mlModel predictionFromFeatures:input + options:options + error:error]; + + return result; +} + +- (BOOL)prewarmAndReturnError:(NSError* __autoreleasing*)error { + NSError *localError = nil; + BOOL result = [self.mlModel prewarmUsingState:self.state error:error]; + if (!result) { + ETCoreMLLogError(localError, + "%@: Failed to prewarm model with identifier = %@", + NSStringFromClass(self.class), + self.identifier); + } + +#if MODEL_STATE_IS_SUPPORTED + if (@available(macOS 15.0, iOS 18.0, tvOS 18.0, watchOS 11.0, *)) { + NSDictionary *stateDescriptions = self.mlModel.modelDescription.stateDescriptionsByName; + [stateDescriptions enumerateKeysAndObjectsUsingBlock:^(NSString *featureName, MLFeatureDescription * __unused obj, BOOL * __unused stop) { + reset_state_for_feature_name(featureName, (MLState *) self.state); + }]; + } +#endif + + + if (error) { + *error = localError; + } + + return result; +} + @end diff --git a/backends/apple/coreml/runtime/delegate/ETCoreMLModelManager.mm b/backends/apple/coreml/runtime/delegate/ETCoreMLModelManager.mm index 8d6d537385b..cd0fbc86f99 100644 --- a/backends/apple/coreml/runtime/delegate/ETCoreMLModelManager.mm +++ b/backends/apple/coreml/runtime/delegate/ETCoreMLModelManager.mm @@ -598,21 +598,8 @@ - (BOOL)prewarmModelWithHandle:(ModelHandle *)handle if (!model) { return NO; } - - NSError *localError = nil; - BOOL result = [model.mlModel prewarmAndReturnError:&localError]; - if (!result) { - ETCoreMLLogError(localError, - "%@: Failed to prewarm model with identifier = %@", - NSStringFromClass(self.assetManager.class), - model.identifier); - } - - if (error) { - *error = localError; - } - - return result; + + return [model prewarmAndReturnError:error]; } - (void)prewarmRecentlyUsedAssetsWithMaxCount:(NSUInteger)maxCount { @@ -682,16 +669,15 @@ - (void)addPrewarmedAsset:(ETCoreMLAsset *)asset { error:&localError]; // Try without output backings. 
if (!modelOutputs && predictionOptions.outputBackings.count > 0) { - localError = nil; executor.ignoreOutputBackings = YES; + localError = nil; + modelOutputs = [executor executeModelWithInputs:inputFeatures + predictionOptions:predictionOptions + loggingOptions:loggingOptions + eventLogger:eventLogger + error:&localError]; } - - modelOutputs = [executor executeModelWithInputs:inputFeatures - predictionOptions:predictionOptions - loggingOptions:loggingOptions - eventLogger:eventLogger - error:&localError]; - + if (error) { *error = localError; } diff --git a/backends/apple/coreml/runtime/delegate/MLModel_Prewarm.h b/backends/apple/coreml/runtime/delegate/MLModel_Prewarm.h index c066608b893..6caf99507dc 100644 --- a/backends/apple/coreml/runtime/delegate/MLModel_Prewarm.h +++ b/backends/apple/coreml/runtime/delegate/MLModel_Prewarm.h @@ -8,6 +8,9 @@ #import +#if !defined(MODEL_STATE_IS_SUPPORTED) && __has_include() +#define MODEL_STATE_IS_SUPPORTED 1 +#endif NS_ASSUME_NONNULL_BEGIN @@ -15,9 +18,10 @@ NS_ASSUME_NONNULL_BEGIN /// Pre-warms the model by running a prediction with zeroed-out inputs. /// +/// @param state The model state. /// @param error On failure, error is filled with the failure information. /// @retval `YES` if the prediction succeeded otherwise `NO`. -- (BOOL)prewarmAndReturnError:(NSError* __autoreleasing*)error; +- (BOOL)prewarmUsingState:(nullable id)state error:(NSError* __autoreleasing*)error; @end diff --git a/backends/apple/coreml/runtime/delegate/MLModel_Prewarm.mm b/backends/apple/coreml/runtime/delegate/MLModel_Prewarm.mm index 71ce967ac3e..d6f59666cf0 100644 --- a/backends/apple/coreml/runtime/delegate/MLModel_Prewarm.mm +++ b/backends/apple/coreml/runtime/delegate/MLModel_Prewarm.mm @@ -71,16 +71,28 @@ + (MLMultiArray *)zeroedMultiArrayWithShape:(NSArray *)shape @implementation MLModel (Prewarm) -- (BOOL)prewarmAndReturnError:(NSError * __autoreleasing *)error { +- (BOOL)prewarmUsingState:(nullable id)state error:(NSError * __autoreleasing *)error { @autoreleasepool { id inputs = ::get_zeroed_inputs(self, error); if (!inputs) { return NO; } - - id outputs = [self predictionFromFeatures:inputs error:error]; + + + id outputs = nil; + if (state != nil) { +#if MODEL_STATE_IS_SUPPORTED + if (@available(macOS 15.0, iOS 18.0, tvOS 18.0, watchOS 11.0, *)) { + outputs = [self predictionFromFeatures:inputs usingState:(MLState *)state error:error]; + return outputs != nil; + } +#endif + } + + outputs = [self predictionFromFeatures:inputs error:error]; return outputs != nil; } } + @end diff --git a/backends/apple/coreml/runtime/include/coreml_backend/delegate.h b/backends/apple/coreml/runtime/include/coreml_backend/delegate.h index a11d41bf7f4..1943e0f05b0 100644 --- a/backends/apple/coreml/runtime/include/coreml_backend/delegate.h +++ b/backends/apple/coreml/runtime/include/coreml_backend/delegate.h @@ -20,7 +20,7 @@ class BackendDelegate; namespace torch { namespace executor { -class CoreMLBackendDelegate final : public PyTorchBackendInterface { +class CoreMLBackendDelegate final : public ::executorch::runtime::BackendInterface { public: CoreMLBackendDelegate() noexcept; ~CoreMLBackendDelegate() = default; diff --git a/backends/apple/coreml/runtime/sdk/ETCoreMLModelAnalyzer.mm b/backends/apple/coreml/runtime/sdk/ETCoreMLModelAnalyzer.mm index 1740faf00e6..988b5d808a0 100644 --- a/backends/apple/coreml/runtime/sdk/ETCoreMLModelAnalyzer.mm +++ b/backends/apple/coreml/runtime/sdk/ETCoreMLModelAnalyzer.mm @@ -88,10 +88,9 @@ - (nullable 
instancetype)initWithCompiledModelAsset:(ETCoreMLAsset *)compiledMod eventLogger:(const executorchcoreml::ModelEventLogger *)eventLogger error:(NSError * __autoreleasing *)error { if (self.profiler == nil) { - ETCoreMLModelProfiler *profiler = [[ETCoreMLModelProfiler alloc] initWithCompiledModelAsset:self.model.asset - outputNames:self.model.orderedOutputNames - configuration:self.configuration - error:error]; + ETCoreMLModelProfiler *profiler = [[ETCoreMLModelProfiler alloc] initWithModel:self.model + configuration:self.configuration + error:error]; self.profiler = profiler; } diff --git a/backends/apple/coreml/runtime/sdk/ETCoreMLModelProfiler.h b/backends/apple/coreml/runtime/sdk/ETCoreMLModelProfiler.h index 07a384a5167..7a43a30d752 100644 --- a/backends/apple/coreml/runtime/sdk/ETCoreMLModelProfiler.h +++ b/backends/apple/coreml/runtime/sdk/ETCoreMLModelProfiler.h @@ -31,14 +31,12 @@ __attribute__((objc_subclassing_restricted)) /// Constructs an `ETCoreMLModelProfiler` instance. /// -/// @param compiledModelAsset The compiled model asset (mlmodelc). -/// @param outputNames The model output names. +/// @param model The model. /// @param configuration The model configuration. /// @param error On failure, error is filled with the failure information. -- (nullable instancetype)initWithCompiledModelAsset:(ETCoreMLAsset*)compiledModelAsset - outputNames:(NSOrderedSet*)outputNames - configuration:(MLModelConfiguration*)configuration - error:(NSError* __autoreleasing*)error NS_DESIGNATED_INITIALIZER; +- (nullable instancetype)initWithModel:(ETCoreMLModel*)model + configuration:(MLModelConfiguration*)configuration + error:(NSError* __autoreleasing*)error NS_DESIGNATED_INITIALIZER; /// Returns profiling info of operations at the specified paths. /// diff --git a/backends/apple/coreml/runtime/sdk/ETCoreMLModelProfiler.mm b/backends/apple/coreml/runtime/sdk/ETCoreMLModelProfiler.mm index c9ad324a6c0..5998701eb0f 100644 --- a/backends/apple/coreml/runtime/sdk/ETCoreMLModelProfiler.mm +++ b/backends/apple/coreml/runtime/sdk/ETCoreMLModelProfiler.mm @@ -8,6 +8,7 @@ #import "ETCoreMLModelProfiler.h" #import "ETCoreMLAsset.h" +#import "ETCoreMLModel.h" #import "ETCoreMLLogging.h" #import "ETCoreMLModelStructurePath.h" #import "ETCoreMLOperationProfilingInfo.h" @@ -221,8 +222,8 @@ void set_model_outputs(id output_features, } @interface ETCoreMLModelProfiler () -/// The CoreML model. -@property (readonly, strong, nonatomic) MLModel *model; +/// The model. +@property (readonly, strong, nonatomic) ETCoreMLModel *model; /// The model output names. 
@property (readonly, copy, nonatomic) NSOrderedSet *outputNames; #if MODEL_PROFILING_IS_AVAILABLE @@ -240,25 +241,19 @@ @interface ETCoreMLModelProfiler () @implementation ETCoreMLModelProfiler -- (nullable instancetype)initWithCompiledModelAsset:(ETCoreMLAsset *)compiledModelAsset - outputNames:(NSOrderedSet *)outputNames - configuration:(MLModelConfiguration *)configuration - error:(NSError * __autoreleasing *)error { +- (nullable instancetype)initWithModel:(ETCoreMLModel *)model + configuration:(MLModelConfiguration *)configuration + error:(NSError * __autoreleasing *)error { #if MODEL_PROFILING_IS_AVAILABLE if (@available(macOS 14.4, iOS 17.4, tvOS 17.4, watchOS 10.4, *)) { - NSURL *compiledModelURL = compiledModelAsset.contentURL; + NSURL *compiledModelURL = model.asset.contentURL; MLComputePlan *computePlan = get_compute_plan_of_model_at_url(compiledModelURL, configuration, error); if (!computePlan) { return nil; } - - MLModel *model = [MLModel modelWithContentsOfURL:compiledModelURL error:error]; - if (!model) { - return nil; - } - + __block NSMutableArray *operationPaths = [NSMutableArray array]; __block NSMutableDictionary *operationToPathMap = [NSMutableDictionary dictionary]; __block NSMutableArray *topologicallySortedOperations = [NSMutableArray new]; @@ -280,7 +275,6 @@ - (nullable instancetype)initWithCompiledModelAsset:(ETCoreMLAsset *)compiledMod self = [super init]; if (self) { - _outputNames = [outputNames copy]; _model = model; _computePlan = computePlan; _operationToPathMap = operationToPathMap; diff --git a/backends/apple/coreml/runtime/test/CoreMLBackendDelegateTests.mm b/backends/apple/coreml/runtime/test/CoreMLBackendDelegateTests.mm index d7218905fc2..691d4d726ed 100644 --- a/backends/apple/coreml/runtime/test/CoreMLBackendDelegateTests.mm +++ b/backends/apple/coreml/runtime/test/CoreMLBackendDelegateTests.mm @@ -13,6 +13,8 @@ #import #import +#import "MLModel_Prewarm.h" + static constexpr size_t kRuntimeMemorySize = 50 * 1024U * 1024U; // 50 MB using namespace torch::executor; @@ -184,20 +186,28 @@ - (void)executeModelAtURL:(NSURL *)modelURL nLoads:(NSUInteger)nLoads nExecution - (void)testAddProgramExecute { NSURL *modelURL = [[self class] bundledResourceWithName:@"add_coreml_all" extension:@"pte"]; XCTAssertNotNil(modelURL); - [self executeModelAtURL:modelURL nLoads:5 nExecutions:2]; + [self executeModelAtURL:modelURL nLoads:1 nExecutions:2]; } - (void)testMulProgramExecute { NSURL *modelURL = [[self class] bundledResourceWithName:@"mul_coreml_all" extension:@"pte"]; XCTAssertNotNil(modelURL); - [self executeModelAtURL:modelURL nLoads:5 nExecutions:2]; + [self executeModelAtURL:modelURL nLoads:1 nExecutions:2]; } - (void)testMV3ProgramExecute { NSURL *modelURL = [[self class] bundledResourceWithName:@"mv3_coreml_all" extension:@"pte"]; XCTAssertNotNil(modelURL); - [self executeModelAtURL:modelURL nLoads:5 nExecutions:2]; + [self executeModelAtURL:modelURL nLoads:1 nExecutions:2]; +} + +#if MODEL_STATE_IS_SUPPORTED +- (void)testStateProgramExecute { + NSURL *modelURL = [[self class] bundledResourceWithName:@"state_coreml_all" extension:@"pte"]; + XCTAssertNotNil(modelURL); + [self executeModelAtURL:modelURL nLoads:1 nExecutions:2]; } +#endif - (void)executeMultipleModelsConcurrently:(NSArray *)modelURLs nLoads:(NSUInteger)nLoads diff --git a/backends/apple/coreml/runtime/test/export_stateful_model.py b/backends/apple/coreml/runtime/test/export_stateful_model.py new file mode 100644 index 00000000000..61d1a93980f --- /dev/null +++ 
b/backends/apple/coreml/runtime/test/export_stateful_model.py @@ -0,0 +1,77 @@ +# Copyright © 2024 Apple Inc. All rights reserved. +# +# Please refer to the license found in the LICENSE file in the root directory of the source tree. + +import os +from pathlib import Path + +import coremltools as ct +import executorch.exir as exir + +import torch + +from executorch.backends.apple.coreml.compiler import CoreMLBackend +from executorch.backends.apple.coreml.partition import CoreMLPartitioner +from torch.export import export + + +class StatefulModel(torch.nn.Module): + def __init__( + self, + embedding_dim: int, + max_seq_len: int, + ): + super().__init__() + self.register_buffer( + "cache", torch.zeros((max_seq_len, embedding_dim), dtype=torch.float32) + ) + + def forward( + self, + q: torch.Tensor, + k_val: torch.Tensor, + input_pos: torch.Tensor, + ): + q_T = q.transpose(0, 1) + k = torch.ops.aten.index_put_(self.cache, [input_pos, None], k_val) + attn = k.mm(q_T) + return attn + + +def main() -> None: + embedding_dim = 3 + max_seq_len = 2 + model = StatefulModel(embedding_dim=embedding_dim, max_seq_len=max_seq_len) + example_inputs = ( + torch.randn((1, embedding_dim)), + torch.randn((1, embedding_dim)), + torch.tensor([0]), + ) + exported_model = export(model, example_inputs) + edge_program_manager = exir.to_edge(exported_model) + compile_specs = CoreMLBackend.generate_compile_specs( + compute_precision=ct.precision.FLOAT16, + compute_unit=ct.ComputeUnit.ALL, + minimum_deployment_target=ct.target.iOS18, + ) + + partitioner = CoreMLPartitioner( + skip_ops_for_coreml_delegation=None, + compile_specs=compile_specs, + ) + + delegated_program_manager = edge_program_manager.to_backend(partitioner) + exec_program = delegated_program_manager.to_executorch( + config=exir.ExecutorchBackendConfig(extract_delegate_segments=True) + ) + + buffer = exec_program.buffer + models_dir = Path(os.path.dirname(os.path.realpath(__file__))) / "models" + models_dir.mkdir(parents=False, exist_ok=True) + file_path = models_dir / "state_coreml_all.pte" + with open(file_path.resolve(), "wb") as file: + file.write(buffer) + + +if __name__ == "__main__": + main() # pragma: no cover diff --git a/backends/apple/coreml/runtime/workspace/executorchcoreml.xcodeproj/project.pbxproj b/backends/apple/coreml/runtime/workspace/executorchcoreml.xcodeproj/project.pbxproj index c347c56db03..2daa5615ba9 100644 --- a/backends/apple/coreml/runtime/workspace/executorchcoreml.xcodeproj/project.pbxproj +++ b/backends/apple/coreml/runtime/workspace/executorchcoreml.xcodeproj/project.pbxproj @@ -7,6 +7,7 @@ objects = { /* Begin PBXBuildFile section */ + 8307EB8A2C9262060011AE6D /* state_coreml_all.pte in Resources */ = {isa = PBXBuildFile; fileRef = 8307EB892C9262060011AE6D /* state_coreml_all.pte */; }; 83BB78A02C65DA7300274ED7 /* ETCoreMLModelDebugInfo.mm in Sources */ = {isa = PBXBuildFile; fileRef = 83BB789F2C65DA7300274ED7 /* ETCoreMLModelDebugInfo.mm */; }; 83BB78BF2C66AAAE00274ED7 /* add_mul_coreml_all.bin in Resources */ = {isa = PBXBuildFile; fileRef = 83BB78BD2C66AAAE00274ED7 /* add_mul_coreml_all.bin */; }; 83BB78C02C66AAAE00274ED7 /* add_mul_coreml_all.pte in Resources */ = {isa = PBXBuildFile; fileRef = 83BB78BE2C66AAAE00274ED7 /* add_mul_coreml_all.pte */; }; @@ -120,6 +121,7 @@ /* End PBXCopyFilesBuildPhase section */ /* Begin PBXFileReference section */ + 8307EB892C9262060011AE6D /* state_coreml_all.pte */ = {isa = PBXFileReference; lastKnownFileType = file; name = state_coreml_all.pte; path = 
../test/models/state_coreml_all.pte; sourceTree = ""; }; 83BB789E2C65DA7300274ED7 /* ETCoreMLModelDebugInfo.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; name = ETCoreMLModelDebugInfo.h; path = ../sdk/ETCoreMLModelDebugInfo.h; sourceTree = ""; }; 83BB789F2C65DA7300274ED7 /* ETCoreMLModelDebugInfo.mm */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.objcpp; name = ETCoreMLModelDebugInfo.mm; path = ../sdk/ETCoreMLModelDebugInfo.mm; sourceTree = ""; }; 83BB78BD2C66AAAE00274ED7 /* add_mul_coreml_all.bin */ = {isa = PBXFileReference; lastKnownFileType = archive.macbinary; name = add_mul_coreml_all.bin; path = ../test/models/add_mul_coreml_all.bin; sourceTree = ""; }; @@ -607,6 +609,7 @@ C98551982AD2542D009143F9 /* mv3_coreml_all.pte */, 83BB78BD2C66AAAE00274ED7 /* add_mul_coreml_all.bin */, 83BB78BE2C66AAAE00274ED7 /* add_mul_coreml_all.pte */, + 8307EB892C9262060011AE6D /* state_coreml_all.pte */, ); name = models; sourceTree = ""; @@ -677,6 +680,7 @@ C985519E2AD2542D009143F9 /* mv3_coreml_all.pte in Resources */, C98551A02AD2542D009143F9 /* add_coreml_all.bin in Resources */, C98551A22AD2542D009143F9 /* mul_coreml_all.pte in Resources */, + 8307EB8A2C9262060011AE6D /* state_coreml_all.pte in Resources */, C98551A32AD2542D009143F9 /* add_coreml_all.pte in Resources */, ); runOnlyForDeploymentPostprocessing = 0; diff --git a/backends/apple/coreml/scripts/generate_test_models.sh b/backends/apple/coreml/scripts/generate_test_models.sh index bbe9809ff8d..0c1822aa828 100755 --- a/backends/apple/coreml/scripts/generate_test_models.sh +++ b/backends/apple/coreml/scripts/generate_test_models.sh @@ -17,14 +17,17 @@ cd "$EXECUTORCH_ROOT_PATH" mkdir "$COREML_DIR_PATH/runtime/test/models/" #Generate models -echo "Executorch: Generating test models" cd "$EXECUTORCH_ROOT_PATH" MODELS=("add" "add_mul" "mul" "mv3") for MODEL in "${MODELS[@]}" do + echo "Executorch: Generating $MODEL model" # TODO: Don't use the script in examples directory. python3 -m examples.apple.coreml.scripts.export --model_name "$MODEL" --save_processed_bytes mv -f "$MODEL""_coreml_all.pte" "$COREML_DIR_PATH/runtime/test/models" mv -f "$MODEL""_coreml_all.bin" "$COREML_DIR_PATH/runtime/test/models" done + +echo "Executorch: Generating stateful model" +python3 "$SCRIPT_DIR_PATH/../runtime/test/export_stateful_model.py" diff --git a/backends/apple/coreml/scripts/install_requirements.sh b/backends/apple/coreml/scripts/install_requirements.sh index 0018b5ffc2d..b3ea0d77ca0 100755 --- a/backends/apple/coreml/scripts/install_requirements.sh +++ b/backends/apple/coreml/scripts/install_requirements.sh @@ -24,7 +24,7 @@ rm -rf "$COREML_DIR_PATH/third-party" mkdir "$COREML_DIR_PATH/third-party" echo "${green}ExecuTorch: Cloning coremltools." -git clone --depth 1 --branch 8.0b1 "https://github.com/apple/coremltools.git" $COREMLTOOLS_DIR_PATH +git clone --depth 1 --branch 8.0 "https://github.com/apple/coremltools.git" $COREMLTOOLS_DIR_PATH cd $COREMLTOOLS_DIR_PATH STATUS=$? @@ -47,6 +47,11 @@ cmake --build "$COREMLTOOLS_DIR_PATH/build" --parallel echo "${green}ExecuTorch: Installing coremltools." pip install "$COREMLTOOLS_DIR_PATH" +# CoreMLTools have started supporting numpy 2.0, +# but ExecuTorch example model test env is still using older transformers, +# so for now we will need to downgrade numpy to 1.x +# TODO: Remove this numpy downgrade once later transformers starts to be used +pip install numpy==1.26.4 STATUS=$? if [ $STATUS -ne 0 ]; then echo "${red}ExecuTorch: Failed to install coremltools." 
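Ahead of the partitioner test below, a compact sketch of the stateful-lowering flow it exercises: with `take_over_mutable_buffer=True` (the default) the mutable buffer is tagged for the delegate and becomes Core ML state, which requires iOS 18 / macOS 15, while passing `False` keeps the buffer on the ExecuTorch side for older OS versions. The module and shapes are illustrative, mirroring the stateful export script above.

import coremltools as ct
import executorch.exir as exir
import torch

from executorch.backends.apple.coreml.compiler import CoreMLBackend
from executorch.backends.apple.coreml.partition import CoreMLPartitioner


class KVCache(torch.nn.Module):
    def __init__(self, max_seq_len: int = 2, embedding_dim: int = 3):
        super().__init__()
        self.register_buffer("cache", torch.zeros((max_seq_len, embedding_dim)))

    def forward(self, q, k_val, input_pos):
        # In-place update of the registered buffer (the "mutable buffer").
        k = torch.ops.aten.index_put_(self.cache, [input_pos, None], k_val)
        return k.mm(q.transpose(0, 1))


example_inputs = (torch.randn((1, 3)), torch.randn((1, 3)), torch.tensor([0]))
exported = torch.export.export(KVCache(), example_inputs)

compile_specs = CoreMLBackend.generate_compile_specs(
    minimum_deployment_target=ct.target.iOS18  # needed when the buffer becomes Core ML state
)
partitioner = CoreMLPartitioner(
    compile_specs=compile_specs,
    take_over_mutable_buffer=True,  # set False to target pre-iOS 18 / pre-macOS 15
)
delegated = exir.to_edge(exported).to_backend(partitioner)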
diff --git a/backends/apple/coreml/test/test_coreml_partitioner.py b/backends/apple/coreml/test/test_coreml_partitioner.py index 34cf531b261..72a7fbf0932 100644 --- a/backends/apple/coreml/test/test_coreml_partitioner.py +++ b/backends/apple/coreml/test/test_coreml_partitioner.py @@ -4,11 +4,14 @@ import unittest +import coremltools as ct + import executorch.exir import torch import torchvision +from executorch.backends.apple.coreml.compiler import CoreMLBackend from executorch.backends.apple.coreml.partition import CoreMLPartitioner @@ -86,8 +89,54 @@ def test_vit_skip_conv(self): if node.op == "call_function" ] == total + def test_buffer(self): + embedding_dim = 3 + max_seq_len = 2 + + class Model(torch.nn.Module): + def __init__(self): + super().__init__() + self.register_buffer( + "cache", + torch.zeros((max_seq_len, embedding_dim), dtype=torch.float32), + ) + + def forward(self, q, k_val, input_pos): + q_T = q.transpose(0, 1) + k = torch.ops.aten.index_put_(self.cache, [input_pos, None], k_val) + attn = k.mm(q_T) + return attn + + model = Model() + model.eval() + + q = torch.randn((1, embedding_dim)) + k_val = torch.randn((1, embedding_dim)) + input_pos = torch.tensor([0]) + example_inputs = (q, k_val, input_pos) + exir_program_aten = torch.export.export(model, example_inputs) + + compile_specs = CoreMLBackend.generate_compile_specs( + minimum_deployment_target=ct.target.iOS18 + ) + partitioner = CoreMLPartitioner(compile_specs=compile_specs) + edge_program_manager = executorch.exir.to_edge( + exir_program_aten, compile_config=self.edge_compile_config + ) + delegated_program_manager = edge_program_manager.to_backend(partitioner) + + assert [ + node.target.__name__ + for node in delegated_program_manager.exported_program().graph.nodes + if node.op == "call_function" + ] == [ + "executorch_call_delegate", + "getitem", + ] + if __name__ == "__main__": test_runner = TestCoreMLPartitioner() test_runner.test_add_sub_skip_mm() test_runner.test_vit_skip_conv() + test_runner.test_buffer() diff --git a/backends/apple/mps/TARGETS b/backends/apple/mps/TARGETS index b8ab3427a9e..1ab92b3fca0 100644 --- a/backends/apple/mps/TARGETS +++ b/backends/apple/mps/TARGETS @@ -95,8 +95,8 @@ runtime.python_test( "//executorch/examples/models:models", "//executorch/exir/tests:models", "//executorch/extension/export_util:export_util", - "//executorch/sdk:lib", - "//executorch/sdk/bundled_program/serialize:lib", + "//executorch/devtools:lib", + "//executorch/devtools/bundled_program/serialize:lib", "fbsource//third-party/pypi/pytest:pytest", ], ) diff --git a/backends/apple/mps/operators/node_visitor.py b/backends/apple/mps/operators/node_visitor.py index d2f7219748a..2b443134bf8 100644 --- a/backends/apple/mps/operators/node_visitor.py +++ b/backends/apple/mps/operators/node_visitor.py @@ -77,7 +77,7 @@ def define_tensor( """Defines a tensor value into the MPSGraph serialization schema Args: - tensor (torch.fx.Node): EdgeIR tensor to define into mps_graph + node (torch.fx.Node): EdgeIR tensor to define into mps_graph mps_graph (MPSGraph): MPSGraph object for serializing into flatbuffer """ @@ -155,7 +155,7 @@ def define_constant( """Defines a scalar value into the MPSGraph serialization schema Args: - tensor (torch.fx.Node): EdgeIR tensor to define into mps_graph + constant_tensor (torch.fx.Node): EdgeIR tensor to define into mps_graph mps_graph (MPSGraph): MPSGraph object for serializing into flatbuffer """ constant_tensor = constant_tensor.contiguous() @@ -191,7 +191,6 @@ def define_scalar( """Defines a 
scalar value into the MPSGraph serialization schema Args: - tensor (torch.fx.Node): EdgeIR tensor to define into mps_graph mps_graph (MPSGraph): MPSGraph object for serializing into flatbuffer """ assert isinstance(val, int) or isinstance(val, float) @@ -229,7 +228,7 @@ def get_serialized_buffer( index of its placement in the constant buffer Args: - tensor (torch.fx.Node): _description_ + node (torch.fx.Node): _description_ mps_graph (MPSGraph): _description_ Returns: @@ -299,7 +298,7 @@ def get_serialized_id( the existent id. Args: - tensor (Union[torch.fx.Node, float]): _description_ + node (Union[torch.fx.Node, float]): _description_ mps_graph (MPSGraph): _description_ Returns: diff --git a/backends/apple/mps/runtime/MPSBackend.mm b/backends/apple/mps/runtime/MPSBackend.mm index b94bdc9319b..cb96edbeb2e 100644 --- a/backends/apple/mps/runtime/MPSBackend.mm +++ b/backends/apple/mps/runtime/MPSBackend.mm @@ -19,7 +19,7 @@ namespace torch { namespace executor { -class MPSBackend final : public PyTorchBackendInterface { +class MPSBackend final : public ::executorch::runtime::BackendInterface { public: ~MPSBackend() = default; diff --git a/backends/apple/mps/runtime/operations/OperationUtils.mm b/backends/apple/mps/runtime/operations/OperationUtils.mm index c3c5c93362a..2336868863d 100644 --- a/backends/apple/mps/runtime/operations/OperationUtils.mm +++ b/backends/apple/mps/runtime/operations/OperationUtils.mm @@ -31,8 +31,13 @@ return MPSDataTypeFloat32; case DataType::mps_data_type_int8: return MPSDataTypeInt8; - case DataType::mps_data_type_int4: - return MPSDataTypeInt4; + case DataType::mps_data_type_int4: { + if (@available(macOS 15.0, iOS 18.0, tvOS 18.0, *)) { + return MPSDataTypeInt4; + } else { + return ((MPSDataType)(MPSDataTypeSignedBit | 4)); + } + } case DataType::mps_data_type_int16: return MPSDataTypeInt16; case DataType::mps_data_type_int32: diff --git a/backends/apple/mps/runtime/operations/QuantDequant.mm b/backends/apple/mps/runtime/operations/QuantDequant.mm index 7818bab2565..c37282f79a1 100644 --- a/backends/apple/mps/runtime/operations/QuantDequant.mm +++ b/backends/apple/mps/runtime/operations/QuantDequant.mm @@ -30,17 +30,19 @@ MPSGraphTensor* inputTensor = getMPSGraphTensor(graphNode->input1_id()); MPSGraphTensor* scalesTensor = getMPSGraphTensor(graphNode->scales_id()); - - MPSGraphTensor *zpTensor = [_mpsGraph constantWithScalar:0 + if (@available(macOS 15.0, iOS 18.0, tvOS 18.0, *)) { + MPSGraphTensor *zpTensor = [_mpsGraph constantWithScalar:0 dataType:MPSDataTypeInt4]; + MPSGraphTensor *wDqTensor = [_mpsGraph dequantizeTensor:inputTensor + scaleTensor:scalesTensor + zeroPointTensor:zpTensor + dataType:MPSDataTypeFloat16 + name:nil]; + _idToMPSGraphTensor[graphNode->output_id()] = wDqTensor; + } else { + _idToMPSGraphTensor[graphNode->output_id()] = nil; + } - MPSGraphTensor *wDqTensor = [_mpsGraph dequantizeTensor:inputTensor - scaleTensor:scalesTensor - zeroPointTensor:zpTensor - dataType:MPSDataTypeFloat16 - name:nil]; - - _idToMPSGraphTensor[graphNode->output_id()] = wDqTensor; return Error::Ok; } diff --git a/backends/apple/mps/targets.bzl b/backends/apple/mps/targets.bzl index 8b9c64e143c..74d79448362 100644 --- a/backends/apple/mps/targets.bzl +++ b/backends/apple/mps/targets.bzl @@ -47,7 +47,7 @@ def define_common_targets(is_xplat = False, platforms = []): "//executorch/exir/backend:backend_lib", "//executorch/extension/pybindings/...", "//executorch/runtime/backend/...", - "//executorch/sdk/runners/...", + "//executorch/devtools/runners/...", 
"//executorch/test/...", "@EXECUTORCH_CLIENTS", ], diff --git a/backends/apple/mps/test/test_mps_utils.py b/backends/apple/mps/test/test_mps_utils.py index d7efe8bde41..6f7d00d7b09 100644 --- a/backends/apple/mps/test/test_mps_utils.py +++ b/backends/apple/mps/test/test_mps_utils.py @@ -12,16 +12,16 @@ import torch from executorch.backends.apple.mps import MPSBackend from executorch.backends.apple.mps.partition import MPSPartitioner +from executorch.devtools import BundledProgram +from executorch.devtools.bundled_program.config import MethodTestCase, MethodTestSuite +from executorch.devtools.bundled_program.serialize import ( + serialize_from_bundled_program_to_flatbuffer, +) from executorch.exir import EdgeCompileConfig, ExirExportedProgram, to_edge from executorch.exir.backend.backend_api import to_backend from executorch.exir.backend.backend_details import CompileSpec from executorch.exir.capture._config import ExecutorchBackendConfig from executorch.extension.export_util.utils import export_to_edge -from executorch.sdk import BundledProgram -from executorch.sdk.bundled_program.config import MethodTestCase, MethodTestSuite -from executorch.sdk.bundled_program.serialize import ( - serialize_from_bundled_program_to_flatbuffer, -) from torch.export import export # Config for Capturing the weights, will be moved in the future @@ -229,7 +229,7 @@ def lower_module_and_test_output( compile_specs = [CompileSpec("use_fp16", bytes([use_fp16]))] if use_partitioner: - logging.info(f"Edge IR graph:\n{edge_program.exported_program().graph}") + logging.info(f"Edge IR graph:\n{edge_program.exported_program()}") delegated_program = edge_program delegated_program = edge_program.to_backend( MPSPartitioner(compile_specs=compile_specs) @@ -239,9 +239,7 @@ def lower_module_and_test_output( ) executorch_program = delegated_program.to_executorch( - config=ExecutorchBackendConfig( - extract_delegate_segments=False, extract_constant_segment=False - ) + config=ExecutorchBackendConfig(extract_delegate_segments=False) ) else: delegated_program = to_backend( @@ -258,9 +256,7 @@ def lower_module_and_test_output( _skip_dim_order=True, # TODO(T182928844): Delegate dim order op to backend. ), ).to_executorch( - config=ExecutorchBackendConfig( - extract_delegate_segments=False, extract_constant_segment=False - ) + config=ExecutorchBackendConfig(extract_delegate_segments=False) ) if bundled_program: diff --git a/backends/apple/mps/utils/mps_utils.py b/backends/apple/mps/utils/mps_utils.py index b6ba215534d..c31ebba0e46 100644 --- a/backends/apple/mps/utils/mps_utils.py +++ b/backends/apple/mps/utils/mps_utils.py @@ -73,7 +73,7 @@ def is_parameter(exp_prog: torch.export.ExportedProgram, node: torch.fx.Node) -> are supplied as inputs to the graph. Args: - edge_program (torch.export.ExportedProgram): _description_ + exp_prog (torch.export.ExportedProgram): _description_ node (torch.fx.Node): _description_ Returns: diff --git a/backends/arm/README.md b/backends/arm/README.md index 7167aa853b6..6f4642f8d44 100644 --- a/backends/arm/README.md +++ b/backends/arm/README.md @@ -9,7 +9,7 @@ The expected flow is: * torch.nn.module -> TOSA -> command_stream for fully AoT flows e.g. embedded. * torch.nn.module -> TOSA for flows supporting a JiT compilation step. -Current backend support is being developed for TOSA to Ethos(TM)-U55/65 via the +Current backend support is being developed for TOSA to Ethos(TM)-U55/65/85 via the ethos-u-vela compilation stack. which follows the fully AoT flow. 
## Layout @@ -33,7 +33,7 @@ Quantization: - `arm_quantizer_utils.py` - Utilities for quantization Runtime: -- `runtime/ArmBackendEthosU.cpp` - The Arm backend implementation of the ExecuTorch runtime backend (PyTorchBackendInterface) for Ethos-U +- `runtime/ArmBackendEthosU.cpp` - The Arm backend implementation of the ExecuTorch runtime backend (BackendInterface) for Ethos-U Other: - `third-party/` - Dependencies on other code - in particular the TOSA serialization_lib for compiling to TOSA and the ethos-u-core-driver for the bare-metal backend supporting Ethos-U diff --git a/backends/arm/TARGETS b/backends/arm/TARGETS new file mode 100644 index 00000000000..220db373710 --- /dev/null +++ b/backends/arm/TARGETS @@ -0,0 +1,83 @@ +load("@fbcode_macros//build_defs:python_library.bzl", "python_library") + +python_library( + name = "arm_partitioner", + srcs = [ + "arm_partitioner.py", + ], + typing = True, + deps = [ + ":arm_backend", + "//executorch/backends/arm/passes:passes", + "//executorch/exir:lib", + ], +) + +python_library( + name = "arm_backend", + srcs = [ + "arm_backend.py", + ], + typing = True, + deps = [ + "fbsource//third-party/pypi/flatbuffers:flatbuffers", + "fbsource//third-party/pypi/ml-dtypes:ml-dtypes", + "fbsource//third-party/serialization_lib/python/serializer:serializer", + "fbsource//third-party/serialization_lib/python/tosa:tosa", + ":arm_vela", + "//executorch/backends/arm/operators:lib", + "//executorch/backends/arm/operators:node_visitor", + "//executorch/backends/arm/passes:passes", + ], +) + +python_library( + name = "arm_vela", + srcs = [ + "arm_vela.py", + ], + typing = True, + deps = [ + "fbsource//third-party/pypi/ethos-u-vela:ethos-u-vela", + ], +) + +python_library( + name = "tosa_mapping", + srcs = [ + "tosa_mapping.py", + ], + typing = True, + deps = [ + "fbsource//third-party/serialization_lib/python/serializer:serializer", + "//caffe2:torch", + ], +) + +python_library( + name = "tosa_quant_utils", + srcs = [ + "tosa_quant_utils.py", + ], + typing = True, + deps = [ + "fbsource//third-party/pypi/numpy:numpy", + "fbsource//third-party/serialization_lib/python/serializer:serializer", + "fbsource//third-party/serialization_lib/python/tosa:tosa", + ":tosa_mapping", + "//executorch/exir/dialects:lib", + ], +) + +python_library( + name = "tosa_utils", + srcs = [ + "tosa_utils.py", + ], + typing = True, + deps = [ + "fbsource//third-party/serialization_lib/python/serializer:serializer", + ":tosa_quant_utils", + "//executorch/backends/arm/operators:node_visitor", + ], +) diff --git a/backends/arm/arm_backend.py b/backends/arm/arm_backend.py index f187191fee0..b83280763c2 100644 --- a/backends/arm/arm_backend.py +++ b/backends/arm/arm_backend.py @@ -3,6 +3,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +# pyre-unsafe + # # Main implementation of AoT flow to partition and preprocess for Arm target # backends. 
Converts via TOSA as an intermediate form supported by AoT and @@ -50,11 +52,11 @@ def __init__(self): def ethosu_compile_spec( self, config: str, - system_config: Optional[str] = None, - memory_mode: Optional[str] = None, + system_config: str, + memory_mode: str, extra_flags: Optional[str] = None, config_ini: Optional[str] = "Arm/vela.ini", - ): + ) -> "ArmCompileSpecBuilder": """ Generate compile spec for Ethos-U NPU @@ -84,7 +86,7 @@ def ethosu_compile_spec( return self - def tosa_compile_spec(self): + def tosa_compile_spec(self) -> "ArmCompileSpecBuilder": """ Generate compile spec for TOSA flatbuffer output """ @@ -94,14 +96,18 @@ def tosa_compile_spec(self): self.output_format = "tosa" return self - def dump_intermediate_artifacts_to(self, output_path: str): + def dump_intermediate_artifacts_to( + self, output_path: str + ) -> "ArmCompileSpecBuilder": """ Sets a path for dumping intermediate results during such as tosa and pte. """ self.path_for_intermediates = output_path return self - def set_permute_memory_format(self, set_nhwc_permutation: bool = True): + def set_permute_memory_format( + self, set_nhwc_permutation: bool = True + ) -> "ArmCompileSpecBuilder": """ Permute to channel last in compiler and runtime. Compilation and runtime will convert rank 4 inputs to channel last for each sub-graph. @@ -109,7 +115,7 @@ def set_permute_memory_format(self, set_nhwc_permutation: bool = True): self.permute_nhwc = set_nhwc_permutation return self - def set_quantize_io(self, quantize_io: bool = False): + def set_quantize_io(self, quantize_io: bool = False) -> "ArmCompileSpecBuilder": """ Quantization of inputs and dequantization of outputs for cases where whole graph is quantized and method signature is not of quantized type. @@ -117,7 +123,7 @@ def set_quantize_io(self, quantize_io: bool = False): self.quantize_io = quantize_io return self - def build(self): + def build(self) -> List[CompileSpec]: """ Generate a list of compile spec objects from the builder """ @@ -159,13 +165,24 @@ def is_tosa(compile_spec: List[CompileSpec]) -> bool: return False -def get_intermediate_path(compile_spec: List[CompileSpec]) -> str: +def get_intermediate_path(compile_spec: List[CompileSpec]) -> Optional[str]: for spec in compile_spec: if spec.key == "debug_artifact_path": return spec.value.decode() return None +def _get_first_delegation_tag(graph_module) -> str | None: + """Get the first delegation tag from the graph_module or return None.""" + for node in graph_module.graph.nodes: + tag = node.meta.get("delegation_tag") + if tag: + return tag + + logger.debug("No delegation tag found in partition.") + return None + + @final class ArmBackend(BackendDetails): @staticmethod @@ -220,8 +237,13 @@ def preprocess( # noqa: C901 # TODO: It would be awesome if this dump could somehow be done on top level and not here. # Problem is that the desc.json has to be created on the tosa_graph object, which we can't # access from top level. - if artifact_path is not None: - dbg_tosa_dump(tosa_graph, artifact_path) + if artifact_path: + tag = _get_first_delegation_tag(graph_module) + dbg_tosa_dump( + tosa_graph, + artifact_path, + suffix="{}".format(f"_{tag}" if tag else ""), + ) # Serialize and return the program. 
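The return-type annotations added to ArmCompileSpecBuilder make its fluent chain explicit; a sketch of how a compile spec list would be assembled with it (the Ethos-U config strings below are illustrative placeholders, not values taken from this patch):

from executorch.backends.arm.arm_backend import ArmCompileSpecBuilder

compile_spec = (
    ArmCompileSpecBuilder()
    .ethosu_compile_spec(
        "ethos-u55-128",                               # illustrative target config
        system_config="Ethos_U55_High_End_Embedded",   # illustrative
        memory_mode="Shared_Sram",                     # illustrative
    )
    .set_permute_memory_format(True)
    .set_quantize_io(True)
    .dump_intermediate_artifacts_to("/tmp/arm_artifacts")
    .build()
)
# `compile_spec` is the List[CompileSpec] handed to the Arm partitioner/backend.
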
While we have always produced TOSA # output as an intermediate, some flows compile to device binaries in diff --git a/backends/arm/arm_partitioner.py b/backends/arm/arm_partitioner.py index f73d97480bc..6b57c3d9658 100644 --- a/backends/arm/arm_partitioner.py +++ b/backends/arm/arm_partitioner.py @@ -3,6 +3,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +# pyre-unsafe + import logging import operator import os @@ -39,10 +41,14 @@ def is_node_supported(self, submodules, node: torch.fx.Node) -> bool: exir_ops.edge.aten.add.Tensor, exir_ops.edge.aten.addmm.default, exir_ops.edge.aten.expand_copy.default, + exir_ops.edge.aten.cat.default, + exir_ops.edge.aten.bmm.default, exir_ops.edge.aten.permute_copy.default, exir_ops.edge.aten.hardtanh.default, exir_ops.edge.aten.convolution.default, exir_ops.edge.aten.div.Tensor, + exir_ops.edge.aten.exp.default, + exir_ops.edge.aten.log.default, exir_ops.edge.aten.split_with_sizes_copy.default, exir_ops.edge.aten.full.default, exir_ops.edge.aten.mul.Tensor, @@ -51,12 +57,14 @@ def is_node_supported(self, submodules, node: torch.fx.Node) -> bool: exir_ops.edge.aten.sigmoid.default, exir_ops.edge.aten.mm.default, exir_ops.edge.aten.repeat.default, + exir_ops.edge.aten.relu.default, exir_ops.edge.aten._softmax.default, exir_ops.edge.aten.slice_copy.Tensor, exir_ops.edge.aten.sub.Tensor, exir_ops.edge.aten.view_copy.default, exir_ops.edge.aten.clone.default, exir_ops.edge.aten.mean.dim, + exir_ops.edge.aten.unsqueeze_copy.default, operator.getitem, exir_ops.edge.quantized_decomposed.quantize_per_tensor.default, exir_ops.edge.quantized_decomposed.dequantize_per_tensor.default, diff --git a/backends/arm/arm_vela.py b/backends/arm/arm_vela.py index f387672b7b4..01bb8bd55e5 100644 --- a/backends/arm/arm_vela.py +++ b/backends/arm/arm_vela.py @@ -3,14 +3,16 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
+# pyre-unsafe + import os import struct -import subprocess import tempfile from typing import List import numpy as np +from ethosu.vela import vela # Pack either input or output tensor block, compose the related arrays into @@ -38,21 +40,22 @@ def vela_compile(tosa_graph, args: List[str]): with tempfile.TemporaryDirectory() as tmpdir: tosaname = "out.tosa" flatbuffer = tosa_graph.serialize() - with open(os.path.join(tmpdir, tosaname), "wb") as f: + tosa_path = os.path.join(tmpdir, tosaname) + with open(tosa_path, "wb") as f: f.write(flatbuffer) # invoke vela - vela_command = f"cd {tmpdir}; vela {' '.join(args)} {tosaname}" - try: - subprocess.run([vela_command], shell=True, check=True, capture_output=True) - except subprocess.CalledProcessError as process_error: - raise RuntimeError( - f"Vela compiler ('{vela_command}') failed with error:\n \ - {process_error.stderr.decode()}\n \ - Stdout:\n{process_error.stdout.decode()}" - ) - - np_path = os.path.join(tmpdir, "output", "out_sg0_vela.npz") + output_dir = os.path.join(tmpdir, "output") + args.append(f"--output-dir={output_dir}") + args.append(tosa_path) + vela.main(" ".join(args).split(" ")) + + if any("ethos-u85" in arg for arg in args) or any( + "debug-force-regor" in arg for arg in args + ): + np_path = os.path.join(tmpdir, "output", "out_vela.npz") + else: + np_path = os.path.join(tmpdir, "output", "out_sg0_vela.npz") blocks = b"" with np.load(np_path, allow_pickle=False) as data: diff --git a/backends/arm/operators/TARGETS b/backends/arm/operators/TARGETS new file mode 100644 index 00000000000..fd04d5fb847 --- /dev/null +++ b/backends/arm/operators/TARGETS @@ -0,0 +1,34 @@ +load("@fbcode_macros//build_defs:python_library.bzl", "python_library") + +python_library( + name = "node_visitor", + srcs = ["node_visitor.py"], + typing = True, + deps = [ + "//executorch/backends/arm:tosa_mapping", + ], +) + +python_library( + name = "ops", + srcs = glob(["op_*.py"]), + typing = True, + deps = [ + "fbsource//third-party/serialization_lib/python/tosa:tosa", + ":node_visitor", + "//executorch/backends/arm:tosa_mapping", + "//executorch/backends/arm:tosa_quant_utils", + "//executorch/backends/arm:tosa_utils", + "//executorch/exir:lib", + ], +) + +python_library( + name = "lib", + srcs = ["__init__.py"], + typing = True, + deps = [ + ":node_visitor", + ":ops", + ], +) diff --git a/backends/arm/operators/__init__.py b/backends/arm/operators/__init__.py index 94a16d8c941..7b94bfa837d 100644 --- a/backends/arm/operators/__init__.py +++ b/backends/arm/operators/__init__.py @@ -3,27 +3,35 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +# pyre-unsafe + from . import ( # noqa node_visitor, op_add, op_addmm, op_avg_pool2d, op_batch_norm, + op_bmm, + op_cat, op_conv2d, op_dequant, op_div, + op_exp, op_full, op_get_item, op_hardtanh, + op_log, op_mean_dim, op_mm, op_mul, op_permute, op_quant, + op_relu, op_repeat, op_sigmoid, op_slice, op_softmax, op_sub, + op_unsqueeze, op_view, ) diff --git a/backends/arm/operators/node_visitor.py b/backends/arm/operators/node_visitor.py index 59edc01e745..99fd0388e45 100644 --- a/backends/arm/operators/node_visitor.py +++ b/backends/arm/operators/node_visitor.py @@ -3,6 +3,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
+# pyre-unsafe + from typing import Dict, List import serializer.tosa_serializer as ts diff --git a/backends/arm/operators/op_add.py b/backends/arm/operators/op_add.py index 33c0c49744b..ec2ade9e8ad 100644 --- a/backends/arm/operators/op_add.py +++ b/backends/arm/operators/op_add.py @@ -3,6 +3,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +# pyre-unsafe + from typing import List import executorch.backends.arm.tosa_quant_utils as tqutils diff --git a/backends/arm/operators/op_addmm.py b/backends/arm/operators/op_addmm.py index 4a0581376c2..b4f782db4a3 100644 --- a/backends/arm/operators/op_addmm.py +++ b/backends/arm/operators/op_addmm.py @@ -3,6 +3,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +# pyre-unsafe + from typing import List import serializer.tosa_serializer as ts diff --git a/backends/arm/operators/op_avg_pool2d.py b/backends/arm/operators/op_avg_pool2d.py index e6d07610c81..4caaad92028 100644 --- a/backends/arm/operators/op_avg_pool2d.py +++ b/backends/arm/operators/op_avg_pool2d.py @@ -2,6 +2,8 @@ # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. + +# pyre-unsafe from typing import List import serializer.tosa_serializer as ts diff --git a/backends/arm/operators/op_batch_norm.py b/backends/arm/operators/op_batch_norm.py index c41941722b3..d17c3a1b81f 100644 --- a/backends/arm/operators/op_batch_norm.py +++ b/backends/arm/operators/op_batch_norm.py @@ -2,6 +2,8 @@ # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. + +# pyre-unsafe from typing import List import serializer.tosa_serializer as ts diff --git a/backends/arm/operators/op_bmm.py b/backends/arm/operators/op_bmm.py new file mode 100644 index 00000000000..161b5d22396 --- /dev/null +++ b/backends/arm/operators/op_bmm.py @@ -0,0 +1,85 @@ +# Copyright 2024 Arm Limited and/or its affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# pyre-unsafe +from typing import List + +import serializer.tosa_serializer as ts +import torch.fx +from executorch.backends.arm.operators.node_visitor import ( + NodeVisitor, + register_node_visitor, +) +from executorch.backends.arm.tosa_mapping import TosaArg +from executorch.backends.arm.tosa_quant_utils import build_rescale, get_quant_node_args +from executorch.backends.arm.tosa_utils import get_two_inputs +from serializer.tosa_serializer import TosaOp + + +@register_node_visitor +class BMMVisitor(NodeVisitor): + target = "aten.bmm.default" + + def __init__(self, *args): + super().__init__(*args) + + def define_node( + self, + node: torch.fx.Node, + tosa_graph: ts.TosaSerializer, + inputs: List[TosaArg], + output: TosaArg, + is_quant_node: bool, + ) -> None: + input0, input1 = get_two_inputs(node) + + # aten.bmm maps directly to MATMUL + # NOTE: For now, only INT8 & FP32 is supported + + # For INT8, we need to get the zero points and add an intermediate tensor + # for a later rescale. 
+ if is_quant_node: + input0_zp = get_quant_node_args(input0).zp + input1_zp = get_quant_node_args(input1).zp + bmm_result = tosa_graph.addIntermediate(output.shape, ts.DType.INT32) + bmm_output_name = bmm_result.name + else: + input0_zp, input1_zp = 0, 0 + bmm_output_name = output.name + + # Add the MATMUL to the TOSA graph. + attr = ts.TosaSerializerAttribute() + attr.MatMulAttribute(A_zp=input0_zp, B_zp=input1_zp) + + tosa_graph.addOperator( + TosaOp.Op().MATMUL, + [input0.name, input1.name], + [bmm_output_name], + attr, + ) + + # As INT8 accumulates into INT32, we need to rescale it back to INT8 + if is_quant_node: + input0_q_params = get_quant_node_args(input0) + input1_q_params = get_quant_node_args(input1) + output_q_params = get_quant_node_args(list(node.users)[0]) + + final_output_scale = ( + input0_q_params.scale * input1_q_params.scale + ) / output_q_params.scale + + build_rescale( + tosa_fb=tosa_graph, + scale=final_output_scale, + # pyre-ignore[61]: Uninitialized local [61]: Local variable `bmm_result` is undefined, or not always defined. + input_node=bmm_result, + output_name=output.name, + output_type=ts.DType.INT8, + output_shape=bmm_result.shape, + input_zp=0, + output_zp=output_q_params.zp, + is_double_round=False, + ) diff --git a/backends/arm/operators/op_cat.py b/backends/arm/operators/op_cat.py new file mode 100644 index 00000000000..652eb397371 --- /dev/null +++ b/backends/arm/operators/op_cat.py @@ -0,0 +1,47 @@ +# Copyright 2024 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# pyre-unsafe + +from typing import List + +import serializer.tosa_serializer as ts +from executorch.backends.arm.operators.node_visitor import ( + NodeVisitor, + register_node_visitor, +) +from executorch.backends.arm.tosa_mapping import TosaArg +from serializer.tosa_serializer import TosaOp +from torch.fx import Node + + +@register_node_visitor +class CatVisitor(NodeVisitor): + target = "aten.cat.default" + + def __init__(self, *args): + super().__init__(*args) + + def define_node( + self, + node: Node, + tosa_graph: ts.TosaSerializer, + inputs: List[TosaArg], + output: TosaArg, + is_quant_node: bool, + ) -> None: + + tensors = inputs[0].special + dim = 0 if len(inputs) < 2 else inputs[1].number + rank = len(output.shape) + dim = (dim + rank) % rank + dim = output.dim_order.index(dim) + + attr = ts.TosaSerializerAttribute() + attr.AxisAttribute(dim) + + tosa_graph.addOperator( + TosaOp.Op().CONCAT, [tensor.name for tensor in tensors], [output.name], attr + ) diff --git a/backends/arm/operators/op_conv2d.py b/backends/arm/operators/op_conv2d.py index 323b11601cb..64cde0724f5 100644 --- a/backends/arm/operators/op_conv2d.py +++ b/backends/arm/operators/op_conv2d.py @@ -2,7 +2,9 @@ # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -from typing import List + +# pyre-unsafe +from typing import cast, List import serializer.tosa_serializer as ts import torch @@ -40,7 +42,7 @@ def adjust_pad_if_needed(self, input, weight, stride, pad, dilation): if mod_remainder > pad: raise RuntimeError( - f"ignoring input element is not currently supported, got a large stride {stride}" + "This case should be handled by the SizeAdjustConv2d pass, is it enabled?" ) return pad - mod_remainder @@ -156,11 +158,12 @@ def define_node( # integer value domain of the next op. Otherwise return float32 output. 
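Both the BMM visitor above and the convolution rescale that follows map INT32 accumulators back into the INT8 output domain with the same scale algebra; a small numeric sketch with made-up quantization parameters:

# real = s0 * s1 * acc_int32   and   real = s_out * (q_out - zp_out)
# => q_out = ((s0 * s1) / s_out) * acc_int32 + zp_out
s0, s1, s_out, zp_out = 0.02, 0.05, 0.1, -3    # hypothetical quant params
acc_int32 = 1234                               # hypothetical MATMUL/conv accumulator

final_output_scale = (s0 * s1) / s_out         # 0.01, as computed in op_bmm.py
q_out = round(final_output_scale * acc_int32) + zp_out
print(q_out)                                   # 9, then clamped to int8 by the rescale op
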
if is_quant_node: # Get scale_factor from input, weight, and output. - _, input_scale, _, _, _, _ = getNodeArgs(node.args[0]) - _, weight_scale, _, _, _, _ = getNodeArgs(node.args[1]) + _, input_scale, _, _, _, _ = getNodeArgs(cast(torch.fx.Node, node.args[0])) + _, weight_scale, _, _, _, _ = getNodeArgs(cast(torch.fx.Node, node.args[1])) _, output_scale, output_zp, _, _, _ = getNodeArgs(list(node.users)[0]) build_rescale_conv_output( tosa_graph, + # pyre-fixme[61]: Uninitialized local [61]: Local variable `conv2d_res` is undefined, or not always defined. conv2d_res, output.name, actual_out_type, diff --git a/backends/arm/operators/op_dequant.py b/backends/arm/operators/op_dequant.py index 269afceccb7..afa1dda9467 100644 --- a/backends/arm/operators/op_dequant.py +++ b/backends/arm/operators/op_dequant.py @@ -2,6 +2,8 @@ # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. + +# pyre-unsafe from typing import List import serializer.tosa_serializer as ts diff --git a/backends/arm/operators/op_div.py b/backends/arm/operators/op_div.py index e365cf6cfe2..0857e0ed32a 100644 --- a/backends/arm/operators/op_div.py +++ b/backends/arm/operators/op_div.py @@ -2,6 +2,8 @@ # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. + +# pyre-unsafe from typing import List import serializer.tosa_serializer as ts diff --git a/backends/arm/operators/op_exp.py b/backends/arm/operators/op_exp.py new file mode 100644 index 00000000000..f98bb3f88c2 --- /dev/null +++ b/backends/arm/operators/op_exp.py @@ -0,0 +1,83 @@ +# Copyright 2024 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# pyre-unsafe +from typing import List + +import numpy as np + +import serializer.tosa_serializer as ts +from executorch.backends.arm.operators.node_visitor import ( + NodeVisitor, + register_node_visitor, +) +from executorch.backends.arm.tosa_mapping import TosaArg + +from executorch.backends.arm.tosa_quant_utils import ( + dequantize_value, + get_quant_node_args, + QuantArgs, + quantize_value, +) +from serializer.tosa_serializer import TosaOp +from torch.fx import Node + + +@register_node_visitor +class ExpVisitor(NodeVisitor): + target = "aten.exp.default" + + def __init__(self, *args): + super().__init__(*args) + + def define_node( + self, + node: Node, + tosa_graph: ts.TosaSerializer, + inputs: List[TosaArg], + output: TosaArg, + is_quant_node: bool, + ) -> None: + + assert len(node.all_input_nodes) == 1 + assert len(node.users) == 1 + + if is_quant_node: + # Assume quantized input is 8 bit. + + # Create attribute for 8 bit table lookup. + input_node = node.all_input_nodes[0] + in_quantargs = get_quant_node_args(input_node) + output_node = list(node.users)[0] + out_quantargs = get_quant_node_args(output_node) + + table = exp_table_8bit(in_quantargs, out_quantargs) + table_attr = ts.TosaSerializerAttribute() + table_attr.TableAttribute(table) + + tosa_graph.addOperator( + TosaOp.Op().TABLE, [inputs[0].name], [output.name], table_attr + ) + else: + tosa_graph.addOperator(TosaOp.Op().EXP, [inputs[0].name], [output.name]) + + +def exp_table_8bit(in_quantargs: QuantArgs, out_quantargs: QuantArgs): + """ + Returns a table mapping 256 entries to exp([qmin,qmax]) + """ + + def exp(x): + # Convert quantized input to floating point exp input space. 
+ v = dequantize_value(x, in_quantargs) + # Compute exp. + v = np.exp(v) + # Convert exp output back to quantized space. + return quantize_value(v, out_quantargs) + + return [ + exp(x) + for x in np.linspace(in_quantargs.qmin, in_quantargs.qmax, 256, dtype=np.int8) + ] diff --git a/backends/arm/operators/op_full.py b/backends/arm/operators/op_full.py index f929b02ee67..eec27bb9090 100644 --- a/backends/arm/operators/op_full.py +++ b/backends/arm/operators/op_full.py @@ -2,6 +2,8 @@ # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. + +# pyre-unsafe from typing import List import numpy as np diff --git a/backends/arm/operators/op_get_item.py b/backends/arm/operators/op_get_item.py index 59004f49686..a696b33aa75 100644 --- a/backends/arm/operators/op_get_item.py +++ b/backends/arm/operators/op_get_item.py @@ -2,6 +2,8 @@ # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. + +# pyre-unsafe from typing import List import serializer.tosa_serializer as ts diff --git a/backends/arm/operators/op_hardtanh.py b/backends/arm/operators/op_hardtanh.py index 3d58f6d628c..62c0a27f05f 100644 --- a/backends/arm/operators/op_hardtanh.py +++ b/backends/arm/operators/op_hardtanh.py @@ -2,6 +2,8 @@ # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. + +# pyre-unsafe from typing import List import serializer.tosa_serializer as ts diff --git a/backends/arm/operators/op_log.py b/backends/arm/operators/op_log.py new file mode 100644 index 00000000000..5276173efa3 --- /dev/null +++ b/backends/arm/operators/op_log.py @@ -0,0 +1,83 @@ +# Copyright 2024 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# pyre-unsafe +from typing import List + +import numpy as np + +import serializer.tosa_serializer as ts +from executorch.backends.arm.operators.node_visitor import ( + NodeVisitor, + register_node_visitor, +) +from executorch.backends.arm.tosa_mapping import TosaArg + +from executorch.backends.arm.tosa_quant_utils import ( + dequantize_value, + get_quant_node_args, + QuantArgs, + quantize_value, +) +from serializer.tosa_serializer import TosaOp +from torch.fx import Node + + +@register_node_visitor +class LogVisitor(NodeVisitor): + target = "aten.log.default" + + def __init__(self, *args): + super().__init__(*args) + + def define_node( + self, + node: Node, + tosa_graph: ts.TosaSerializer, + inputs: List[TosaArg], + output: TosaArg, + is_quant_node: bool, + ) -> None: + + assert len(node.all_input_nodes) == 1 + assert len(node.users) == 1 + + if is_quant_node: + # Assume quantized input is 8 bit. + + # Create attribute for 8 bit table lookup. 
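exp_table_8bit and the log table below follow the same recipe: dequantize each of the 256 INT8 codes, apply the float op, and requantize, producing a table the TOSA TABLE operator can index directly. A self-contained sketch of that recipe (QuantParams here is a stand-in for the backend's QuantArgs, not the real class):

from dataclasses import dataclass

import numpy as np


@dataclass
class QuantParams:  # hypothetical stand-in for tosa_quant_utils.QuantArgs
    scale: float
    zp: int
    qmin: int = -128
    qmax: int = 127


def build_8bit_table(fn, in_q: QuantParams, out_q: QuantParams) -> list:
    table = []
    for code in np.linspace(in_q.qmin, in_q.qmax, 256, dtype=np.int8):
        x = (int(code) - in_q.zp) * in_q.scale      # dequantize the INT8 code
        y = fn(x)                                   # float op, e.g. np.exp
        q = round(y / out_q.scale) + out_q.zp       # requantize into the output domain
        table.append(int(np.clip(q, out_q.qmin, out_q.qmax)))
    return table


exp_table = build_8bit_table(np.exp, QuantParams(0.05, 0), QuantParams(0.1, -128))
assert len(exp_table) == 256
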
+ input_node = node.all_input_nodes[0] + in_quantargs = get_quant_node_args(input_node) + output_node = list(node.users)[0] + out_quantargs = get_quant_node_args(output_node) + + table = log_table_8bit(in_quantargs, out_quantargs) + table_attr = ts.TosaSerializerAttribute() + table_attr.TableAttribute(table) + + tosa_graph.addOperator( + TosaOp.Op().TABLE, [inputs[0].name], [output.name], table_attr + ) + else: + tosa_graph.addOperator(TosaOp.Op().LOG, [inputs[0].name], [output.name]) + + +def log_table_8bit(in_quantargs: QuantArgs, out_quantargs: QuantArgs): + """ + Returns a table mapping 256 entries to log([qmin,qmax]) + """ + + def log(x): + # Convert quantized input to floating point log input space. + v = dequantize_value(x, in_quantargs) + # Compute log. + v = np.log(v) + # Convert log output back to quantized space. + return quantize_value(v, out_quantargs) + + return [ + log(x) + for x in np.linspace(in_quantargs.qmin, in_quantargs.qmax, 256, dtype=np.int8) + ] diff --git a/backends/arm/operators/op_mean_dim.py b/backends/arm/operators/op_mean_dim.py index 20e1b2b8d76..3c9aea30856 100644 --- a/backends/arm/operators/op_mean_dim.py +++ b/backends/arm/operators/op_mean_dim.py @@ -2,6 +2,8 @@ # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. + +# pyre-unsafe from typing import List import serializer.tosa_serializer as ts @@ -11,7 +13,6 @@ register_node_visitor, ) from executorch.backends.arm.tosa_mapping import TosaArg -from executorch.backends.arm.tosa_utils import build_avg_pool_2d_common @register_node_visitor @@ -30,29 +31,4 @@ def define_node( is_quant_node: bool, ) -> None: - input_tensor = inputs[0] - dim = node.args[1] - keep_dim = node.args[2] - - # mean.dim(-1, -2) is the same as avg_pool2d when just computing mean over HW dimensions. - # Since tosa doesn't have mean.dim operation, lowers it to average pooling instead. - if dim == [-1, -2]: - if keep_dim is True: - # Given the shape format of input is (N, C, H, W) - kernel_size = [input_tensor.shape[2], input_tensor.shape[3]] - stride = [1, 1] - padding = [0, 0, 0, 0] - - build_avg_pool_2d_common( - node, - tosa_graph, - input_tensor, - kernel_size, - stride, - padding, - is_quant_node, - output, - ) - return - raise AssertionError("unsupported") diff --git a/backends/arm/operators/op_mm.py b/backends/arm/operators/op_mm.py index f7097022f12..ebddb3a40e2 100644 --- a/backends/arm/operators/op_mm.py +++ b/backends/arm/operators/op_mm.py @@ -3,6 +3,8 @@ # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. + +# pyre-unsafe from typing import List import serializer.tosa_serializer as ts @@ -96,6 +98,7 @@ def define_node( build_rescale( tosa_fb=tosa_graph, scale=final_output_scale, + # pyre-ignore[61]: Uninitialized local [61]: Local variable `reshape_intermediate` is undefined, or not always defined. input_node=reshape_intermediate, output_name=output.name, output_type=ts.DType.INT8, diff --git a/backends/arm/operators/op_mul.py b/backends/arm/operators/op_mul.py index e9cbfcbd7cc..c152e8759ef 100644 --- a/backends/arm/operators/op_mul.py +++ b/backends/arm/operators/op_mul.py @@ -3,7 +3,9 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
-from typing import List +# pyre-unsafe + +from typing import cast, List import executorch.backends.arm.tosa_quant_utils as tqutils import executorch.backends.arm.tosa_utils as tutils @@ -35,8 +37,12 @@ def define_node( if is_quant_node: input_A = inputs[0] input_B = inputs[1] - input_A_qargs = tqutils.get_quant_node_args(node.args[0]) - input_B_qargs = tqutils.get_quant_node_args(node.args[1]) + input_A_qargs = tqutils.get_quant_node_args( + cast(torch.fx.Node, node.args[0]) + ) + input_B_qargs = tqutils.get_quant_node_args( + cast(torch.fx.Node, node.args[1]) + ) input_A.shape = tutils.tosa_shape(input_A.shape, input_A.dim_order) input_B.shape = tutils.tosa_shape(input_B.shape, input_B.dim_order) diff --git a/backends/arm/operators/op_output.py b/backends/arm/operators/op_output.py index 7d163114aa8..1b053b18edc 100644 --- a/backends/arm/operators/op_output.py +++ b/backends/arm/operators/op_output.py @@ -3,6 +3,10 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +# pyre-unsafe + +from typing import cast + import serializer.tosa_serializer as ts import torch @@ -11,7 +15,7 @@ def process_output( node: torch.fx.Node, tosa_graph: ts.TosaSerializer, ): - for output in node.args[0]: + for output in cast(tuple[torch.fx.Node, ...], node.args[0]): tosa_graph.addOutputTensor( tosa_graph.currRegion.currBasicBlock.tensors[output.name] ) diff --git a/backends/arm/operators/op_permute.py b/backends/arm/operators/op_permute.py index eafd6af3678..167a0c382f4 100644 --- a/backends/arm/operators/op_permute.py +++ b/backends/arm/operators/op_permute.py @@ -3,6 +3,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +# pyre-unsafe + from typing import List import serializer.tosa_serializer as ts diff --git a/backends/arm/operators/op_placeholder.py b/backends/arm/operators/op_placeholder.py index 0b2e65f45d0..b5dcf3f9873 100644 --- a/backends/arm/operators/op_placeholder.py +++ b/backends/arm/operators/op_placeholder.py @@ -3,9 +3,11 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
+# pyre-unsafe + import numpy as np import serializer.tosa_serializer as ts -import torch +import torch.fx from executorch.backends.arm.tosa_mapping import TosaArg from executorch.backends.arm.tosa_quant_utils import ( get_quant_arg_dtype, @@ -130,6 +132,21 @@ def process_inputs_to_buffers( ) +def process_inputs_to_lifted_tensor_constants( + node: torch.fx.Node, + tosa_graph: ts.TosaSerializer, + edge_program: ExportedProgram, +): + arg = TosaArg(node) + tensor_name = edge_program.graph_signature.inputs_to_lifted_tensor_constants[ + arg.name + ] + tensor = edge_program.tensor_constants[tensor_name] + tensor_data = tensor.detach().numpy() + + tosa_graph.addConst(tensor_data.shape, arg.dtype, tensor_data, name=arg.name) + + def process_placeholder( node: torch.fx.Node, tosa_graph: ts.TosaSerializer, @@ -145,5 +162,11 @@ def process_placeholder( process_inputs_to_parameters(node, tosa_graph, edge_program) elif node.name in edge_program.graph_signature.inputs_to_buffers: process_inputs_to_buffers(node, tosa_graph, edge_program) + elif node.name in edge_program.graph_signature.inputs_to_lifted_tensor_constants: + process_inputs_to_lifted_tensor_constants(node, tosa_graph, edge_program) + elif node.name in edge_program.graph_signature.inputs_to_lifted_custom_objs: + raise NotImplementedError( + "Placeholder is of type 'lifted custom object' which is not supported." + ) else: - raise RuntimeError(f"Unknown placeholder {node.name}") + raise RuntimeError(f"Placeholder '{node.name}' is of unknown type.") diff --git a/backends/arm/operators/op_quant.py b/backends/arm/operators/op_quant.py index e6a62b3f206..8f83e79442d 100644 --- a/backends/arm/operators/op_quant.py +++ b/backends/arm/operators/op_quant.py @@ -2,6 +2,8 @@ # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. + +# pyre-unsafe from typing import List import serializer.tosa_serializer as ts diff --git a/backends/arm/operators/op_relu.py b/backends/arm/operators/op_relu.py new file mode 100644 index 00000000000..20bba3f6545 --- /dev/null +++ b/backends/arm/operators/op_relu.py @@ -0,0 +1,57 @@ +# Copyright 2024 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +# pyre-unsafe + +import executorch.backends.arm.tosa_quant_utils as tqutils +import serializer.tosa_serializer as ts +import torch.fx +from executorch.backends.arm.operators.node_visitor import ( + NodeVisitor, + register_node_visitor, +) +from executorch.backends.arm.tosa_mapping import TosaArg +from serializer.tosa_serializer import TosaOp + + +@register_node_visitor +class ReluVisitor(NodeVisitor): + target = "aten.relu.default" + + def __init__(self, *args): + super().__init__(*args) + + def define_node( + self, + node: torch.fx.Node, + tosa_graph: ts.TosaSerializer, + inputs: list[TosaArg], + output: TosaArg, + is_quant_node: bool, + ) -> None: + attr = ts.TosaSerializerAttribute() + + clamp_min_fp = 0.0 + clamp_max_fp = 0.0 + clamp_min_qs = 0 + clamp_max_qs = 0 + if is_quant_node: + out_qargs = tqutils.get_quant_node_args(list(node.users)[0]) + clamp_min_qs = tqutils.quantize_value(0, out_qargs) + clamp_max_qs = tqutils.quantize_value(float("inf"), out_qargs) + + else: + clamp_min_fp = 0 + clamp_max_fp = float("inf") + + attr.ClampAttribute( + tosa_graph.builder, + clamp_min_qs, + clamp_max_qs, + clamp_min_fp, + clamp_max_fp, + ) + + tosa_graph.addOperator(TosaOp.Op().CLAMP, [inputs[0].name], [output.name], attr) diff --git a/backends/arm/operators/op_repeat.py b/backends/arm/operators/op_repeat.py index 261fcca12e7..20de9e0846a 100644 --- a/backends/arm/operators/op_repeat.py +++ b/backends/arm/operators/op_repeat.py @@ -3,6 +3,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +# pyre-unsafe + import serializer.tosa_serializer as ts import torch from executorch.backends.arm.operators.node_visitor import ( diff --git a/backends/arm/operators/op_sigmoid.py b/backends/arm/operators/op_sigmoid.py index 884c803482b..0087b1f7a81 100644 --- a/backends/arm/operators/op_sigmoid.py +++ b/backends/arm/operators/op_sigmoid.py @@ -2,6 +2,8 @@ # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. + +# pyre-unsafe from typing import List import numpy as np diff --git a/backends/arm/operators/op_slice.py b/backends/arm/operators/op_slice.py index e562e0724e2..0dfb287cd75 100644 --- a/backends/arm/operators/op_slice.py +++ b/backends/arm/operators/op_slice.py @@ -3,6 +3,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +# pyre-unsafe + from typing import List import serializer.tosa_serializer as ts diff --git a/backends/arm/operators/op_softmax.py b/backends/arm/operators/op_softmax.py index 627fa64aed1..1ac42413189 100644 --- a/backends/arm/operators/op_softmax.py +++ b/backends/arm/operators/op_softmax.py @@ -2,6 +2,8 @@ # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
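In the quantized path of the ReLU visitor above, the op degenerates to a CLAMP whose bounds are the quantized encodings of 0.0 and +inf; a tiny numeric sketch with hypothetical output quantization parameters:

scale, zp, qmax = 0.02, -10, 127         # hypothetical INT8 output quant params

clamp_min_qs = round(0.0 / scale) + zp   # -10: the INT8 code for float 0.0 is the zero point
clamp_max_qs = qmax                      # quantizing float("inf") saturates at qmax
print(clamp_min_qs, clamp_max_qs)        # -10 127
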
+ +# pyre-unsafe from typing import List import serializer.tosa_serializer as ts @@ -33,7 +35,7 @@ def define_node( input_name = inputs[0].name dim_order = inputs[0].dim_order input_shape = tosa_shape(inputs[0].shape, dim_order) - dim_value = dim_order.index(inputs[1].number) + dim_value = dim_order.index(inputs[1].number % len(dim_order)) ## softmax = exp(logits - max(logits)) / reduce_sum(exp(logits - max(logits)), -1) # FP32 diff --git a/backends/arm/operators/op_sub.py b/backends/arm/operators/op_sub.py index 3dc1519f370..2089b6e9e96 100644 --- a/backends/arm/operators/op_sub.py +++ b/backends/arm/operators/op_sub.py @@ -3,6 +3,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +# pyre-unsafe + from typing import List import executorch.backends.arm.tosa_quant_utils as tqutils diff --git a/backends/arm/operators/op_unsqueeze.py b/backends/arm/operators/op_unsqueeze.py new file mode 100644 index 00000000000..c14128fdc8c --- /dev/null +++ b/backends/arm/operators/op_unsqueeze.py @@ -0,0 +1,53 @@ +# Copyright 2024 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. +# +# Follows this specification: https://pytorch.org/docs/stable/generated/torch.unsqueeze.html + +# pyre-unsafe + +import serializer.tosa_serializer as ts +import torch.fx +from executorch.backends.arm.operators.node_visitor import ( + NodeVisitor, + register_node_visitor, +) +from executorch.backends.arm.tosa_mapping import TosaArg +from executorch.backends.arm.tosa_utils import tosa_shape +from serializer.tosa_serializer import TosaOp + + +@register_node_visitor +class UnsqueezeVisitor(NodeVisitor): + target = "aten.unsqueeze_copy.default" + + def __init__(self, *args): + super().__init__(*args) + + def define_node( + self, + node: torch.fx.Node, + tosa_graph: ts.TosaSerializer, + inputs: list[TosaArg], + output: TosaArg, + is_quant_node: bool, + ) -> None: + + dim = inputs[1].number + shape = inputs[0].shape + rank = len(shape) + + assert -rank - 1 <= dim < rank + 1 + if dim < 0: + dim = dim + rank + 1 + + new_shape = list(shape) + new_shape.insert(dim, 1) + new_shape = tosa_shape(new_shape, output.dim_order) + + attr = ts.TosaSerializerAttribute() + attr.ReshapeAttribute(new_shape) + tosa_graph.addOperator( + TosaOp.Op().RESHAPE, [inputs[0].name], [output.name], attr + ) diff --git a/backends/arm/operators/op_view.py b/backends/arm/operators/op_view.py index 682eacd5e38..8667df590dc 100644 --- a/backends/arm/operators/op_view.py +++ b/backends/arm/operators/op_view.py @@ -2,10 +2,13 @@ # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
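Two of the changes above normalize negative dimension indices: the softmax fix wraps dim modulo the rank before the dim_order lookup, and the new unsqueeze visitor accepts insertion positions in [-rank-1, rank]. In plain Python:

# softmax: wrap a negative dim before looking it up in the annotated dim order.
dim_order = (0, 2, 3, 1)          # NHWC-annotated graph, rank 4
dim = -1                          # "last dimension" as passed by aten._softmax
print(dim_order.index(dim % len(dim_order)))   # -1 % 4 == 3 -> index 2

# unsqueeze: a new axis may go one past the end, so the wrap adds rank + 1.
shape, rank = [2, 3], 2
dim = -1
if dim < 0:
    dim = dim + rank + 1          # -1 -> 2: append a trailing axis
new_shape = list(shape)
new_shape.insert(dim, 1)
print(new_shape)                  # [2, 3, 1]
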
+ +# pyre-unsafe from typing import List import serializer.tosa_serializer as ts import torch +import tosa.Op as TosaOp from executorch.backends.arm.operators.node_visitor import ( NodeVisitor, @@ -13,7 +16,6 @@ ) from executorch.backends.arm.tosa_mapping import TosaArg from executorch.backends.arm.tosa_utils import tosa_shape -from serializer.tosa_serializer import TosaOp @register_node_visitor diff --git a/backends/arm/passes/TARGETS b/backends/arm/passes/TARGETS new file mode 100644 index 00000000000..ca20b03fccd --- /dev/null +++ b/backends/arm/passes/TARGETS @@ -0,0 +1,12 @@ +load("@fbcode_macros//build_defs:python_library.bzl", "python_library") + +python_library( + name = "passes", + srcs = glob(["*.py"]), + typing = True, + deps = [ + "//executorch/backends/arm:tosa_quant_utils", + "//executorch/backends/arm:tosa_utils", + "//executorch/exir:lib", + ], +) diff --git a/backends/arm/passes/annotate_channels_last_dim_order_pass.py b/backends/arm/passes/annotate_channels_last_dim_order_pass.py index 9bb45c504a4..222c0a7cb36 100644 --- a/backends/arm/passes/annotate_channels_last_dim_order_pass.py +++ b/backends/arm/passes/annotate_channels_last_dim_order_pass.py @@ -4,6 +4,10 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +# pyre-unsafe + +from typing import cast + import torch from executorch.backends.arm.tosa_quant_utils import dq_op from executorch.backends.arm.tosa_utils import is_consumer_node_depthwise_conv2d @@ -28,9 +32,11 @@ def is_weight_node_for_depthwise_conv2d(self, node: torch.fx.Node): if node.target != dq_op: return False prev_node = node.args[0] - if prev_node.op != "placeholder": + if cast(torch.fx.Node, prev_node).op != "placeholder": return False - return is_consumer_node_depthwise_conv2d(node) + if is_consumer_node_depthwise_conv2d(node): + consumer_node = list(node.users)[0] + return consumer_node.args[1] == node elif node.op == "placeholder": # node is an input, weight or bias node consumer_node = list(node.users)[0] @@ -46,7 +52,9 @@ def call(self, graph_module: torch.fx.GraphModule): NHWC_Order = (0, 2, 3, 1) HWCM_Order = (2, 3, 0, 1) for node in graph_module.graph.nodes: - if isinstance(node.meta["val"], tuple): + if isinstance( + node.meta["val"], (tuple, torch.fx.immutable_collections.immutable_list) + ): node_data = node.meta["val"][0].data else: node_data = node.meta["val"].data diff --git a/backends/arm/passes/arm_pass_manager.py b/backends/arm/passes/arm_pass_manager.py index 054d823dbbb..75ef551171e 100644 --- a/backends/arm/passes/arm_pass_manager.py +++ b/backends/arm/passes/arm_pass_manager.py @@ -5,6 +5,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
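The AnnotateChannelsLastDimOrder pass above tags rank-4 tensors with the NHWC dim order (0, 2, 3, 1); downstream visitors use that order to permute shapes for TOSA, roughly as follows (standalone illustration, not the backend's tosa_shape helper):

nchw_shape = (1, 8, 16, 16)            # (N, C, H, W) as seen in the exported graph
nhwc_order = (0, 2, 3, 1)              # NHWC_Order from the pass above
tosa_side_shape = tuple(nchw_shape[d] for d in nhwc_order)
print(tosa_side_shape)                 # (1, 16, 16, 8) -- channels last
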
+# pyre-unsafe + import torch from executorch.backends.arm.passes.annotate_channels_last_dim_order_pass import ( AnnotateChannelsLastDimOrder, @@ -15,22 +17,28 @@ from executorch.backends.arm.passes.convert_split_to_slice import ( ConvertSplitToSlicePass, ) +from executorch.backends.arm.passes.meandim_to_averagepool_pass import ( + ConvertMeanDimToAveragePool, +) from executorch.backends.arm.passes.remove_clone_pass import RemoveClonePass +from executorch.backends.arm.passes.size_adjust_conv2d_pass import SizeAdjustConv2DPass from executorch.exir.backend.compile_spec_schema import CompileSpec from executorch.exir.pass_manager import PassManager class ArmPassManager(PassManager): - def _transform(self, graph_module: torch.fx.Graph): + def _transform(self, graph_module: torch.fx.GraphModule): return self(graph_module).graph_module def transform_to_backend_pipeline( - self, graph_module: torch.fx.Graph, compile_spec: CompileSpec + self, graph_module: torch.fx.GraphModule, compile_spec: list[CompileSpec] ): """Apply passes before transforming program to backend""" + self.add_pass(SizeAdjustConv2DPass()) self.add_pass(RemoveClonePass()) self.add_pass(ConvertExpandCopyToRepeatPass()) + self.add_pass(ConvertMeanDimToAveragePool()) self.add_pass(ConvertSplitToSlicePass()) for spec in compile_spec: if spec.key == "permute_memory_format": diff --git a/backends/arm/passes/convert_expand_copy_to_repeat.py b/backends/arm/passes/convert_expand_copy_to_repeat.py index 53138682d56..249c014ae67 100644 --- a/backends/arm/passes/convert_expand_copy_to_repeat.py +++ b/backends/arm/passes/convert_expand_copy_to_repeat.py @@ -4,6 +4,10 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +# pyre-unsafe + +from typing import cast + import torch.fx from executorch.backends.arm.tosa_mapping import extract_tensor_meta from executorch.exir.dialects._ops import ops as exir_ops @@ -31,7 +35,7 @@ def call(self, graph_module: torch.fx.GraphModule): expand_node = src_partition.nodes[0] _, shape, _ = extract_tensor_meta(expand_node.all_input_nodes[0].meta) - multiples = expand_node.args[1] + multiples = cast(tuple[int], expand_node.args[1]) expanded_rank = len(multiples) # Expanded shape is 'shape' front-padded with ones. diff --git a/backends/arm/passes/convert_split_to_slice.py b/backends/arm/passes/convert_split_to_slice.py index ff978d4d9ec..29aae37fe9e 100644 --- a/backends/arm/passes/convert_split_to_slice.py +++ b/backends/arm/passes/convert_split_to_slice.py @@ -4,6 +4,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +# pyre-unsafe + import torch.fx from executorch.backends.arm.tosa_mapping import extract_tensor_meta from executorch.exir.dialects._ops import ops as exir_ops diff --git a/backends/arm/passes/meandim_to_averagepool_pass.py b/backends/arm/passes/meandim_to_averagepool_pass.py new file mode 100644 index 00000000000..0974eac740c --- /dev/null +++ b/backends/arm/passes/meandim_to_averagepool_pass.py @@ -0,0 +1,54 @@ +# Copyright 2024 Arm Limited and/or its affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +# pyre-unsafe + +from typing import Any, cast, Dict, Tuple + +import torch.fx + +from executorch.exir.dialects._ops import ops as exir_ops +from executorch.exir.pass_base import ExportPass, NodeMetadata, ProxyValue + +Argument = Any + + +class ConvertMeanDimToAveragePool(ExportPass): + """ + Replace a mean operation with dim = [-1, -2] and keep_dim = True with an average pool operation. + """ + + def call_operator( + self, + op: torch.fx.node.Target, + args: Tuple[Argument, ...], + kwargs: Dict[str, Argument], + meta: NodeMetadata, + ) -> ProxyValue: + if op != exir_ops.edge.aten.mean.dim: + return super().call_operator(op, args, kwargs, meta) + + input_value = cast(ProxyValue, args[0]) + dim = cast(list, args[1]) + keep_dim = cast(bool, args[2]) if len(args) > 2 else False + + # averagepool2d gets converted to a mean operation with dim = [-1, -2] and keep_dim = True + # so check the dim argument for this case + if dim == [-1, -2] and keep_dim is True: + # Given the shape format of input is (N, C, H, W) + kernel_size = [ + input_value.to_tensor().size()[2], + input_value.to_tensor().size()[3], + ] + stride = [1, 1] + return super().call_operator( + exir_ops.edge.aten.avg_pool2d.default, + (input_value, kernel_size, stride), + {}, + meta, + ) + else: + return super().call_operator(op, args, kwargs, meta) diff --git a/backends/arm/passes/remove_clone_pass.py b/backends/arm/passes/remove_clone_pass.py index 6108080cb0d..64a1ae8f43e 100644 --- a/backends/arm/passes/remove_clone_pass.py +++ b/backends/arm/passes/remove_clone_pass.py @@ -4,6 +4,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +# pyre-unsafe + import torch from executorch.exir.dialects._ops import ops as exir_ops from executorch.exir.pass_base import ExportPass, PassResult diff --git a/backends/arm/passes/size_adjust_conv2d_pass.py b/backends/arm/passes/size_adjust_conv2d_pass.py new file mode 100644 index 00000000000..980ab09e597 --- /dev/null +++ b/backends/arm/passes/size_adjust_conv2d_pass.py @@ -0,0 +1,131 @@ +# Copyright 2024 Arm Limited and/or its affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
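ConvertMeanDimToAveragePool relies on the fact that a mean over the last two dims with keep_dim=True is exactly a full-window average pool on an (N, C, H, W) tensor; a quick eager-mode check of that equivalence:

import torch
import torch.nn.functional as F

x = torch.randn(1, 3, 8, 8)
mean = x.mean(dim=[-1, -2], keepdim=True)                       # (1, 3, 1, 1)
pooled = F.avg_pool2d(x, kernel_size=(x.shape[2], x.shape[3]))  # full-window pool
assert torch.allclose(mean, pooled, atol=1e-6)
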
+ +# pyre-unsafe + +from typing import cast, Optional + +import torch.fx +from executorch.backends.arm.tosa_quant_utils import is_quant_node +from executorch.exir.dialects._ops import ops as exir_ops +from executorch.exir.pass_base import ExportPass, PassResult +from torch._ops import OpOverload + + +def conv_remainder(input_length, pad, dilation, weight, stride): + """ + Returns the size + """ + return (input_length + 2 * pad - dilation * (weight - 1) - 1) % stride + + +def insert_q_dq_pair( + graph: torch.fx.Graph, + anchor: torch.fx.Node, + q_params: tuple, +): + with graph.inserting_after(anchor): + q = create_node( + graph=graph, + op_target=exir_ops.edge.quantized_decomposed.quantize_per_tensor.default, + args=(), # We add the argument last + ) + q.meta = anchor.meta + + with graph.inserting_after(q): + dq = create_node( + graph=graph, + op_target=exir_ops.edge.quantized_decomposed.dequantize_per_tensor.default, + args=(q,) + q_params, + ) + dq.meta = q.meta + + anchor.replace_all_uses_with(dq) + # We add this last so the replace all uses above does not replace the quantized + # node's first use + q.args = (anchor,) + q_params + return dq + + +def create_node( + graph: torch.fx.Graph, + op_target: OpOverload, + args: tuple = (), + kwargs: Optional[dict] = None, +): + return graph.create_node( + "call_function", + op_target, + args=args, + kwargs=kwargs or {}, + ) + + +class SizeAdjustConv2DPass(ExportPass): + """ + Adjust the convolution input size to match perfectly with the + weight size, padding, stride and dilation parameters. + This is done by inserting a slice op to remove the uneven end of the input. + """ + + conv2d_op = exir_ops.edge.aten.convolution.default + slice_op = exir_ops.edge.aten.slice_copy.Tensor + + def call(self, graph_module: torch.fx.GraphModule): + graph = graph_module.graph + modified_graph = False + for node in graph.nodes: + if node.op != "call_function": + continue + if node.target != self.conv2d_op: + continue + + conv_node = cast(torch.fx.Node, node) + input_node, weight, _, stride_hw, pad_hw, dilation_hw, _, _, _ = ( + conv_node.args + ) + weight_shape = cast(torch.fx.Node, weight).meta["val"].shape + input_shape = cast(torch.fx.Node, input_node).meta["val"].shape + + slice_args = [] + for stride, pad, dilation, dim in zip( + cast(list, stride_hw), + cast(list, pad_hw), + cast(list, dilation_hw), + (2, 3), + ): + remainder = conv_remainder( + input_shape[dim], pad, dilation, weight_shape[dim], stride + ) + if remainder > pad: + adjustment = remainder - pad + args = (dim, 0, input_shape[dim] - adjustment) + slice_args.append(args) + if len(slice_args) == 0: + continue + + with graph_module.graph.inserting_before(node): + last_node = cast(torch.fx.Node, input_node) + for args in slice_args: + slice_node = graph.create_node( + "call_function", self.slice_op, (last_node,) + args + ) + if is_quant_node(last_node): + q_params = last_node.args[1:] + dq_node = insert_q_dq_pair( + graph_module.graph, slice_node, q_params + ) + last_node = dq_node + else: + last_node = slice_node + conv_node.replace_input_with(cast(torch.fx.Node, input_node), last_node) + modified_graph = True + + if modified_graph: + graph_module = super().call(graph_module).graph_module + graph.eliminate_dead_code() + graph_module.recompile() + return PassResult(graph_module, True) diff --git a/backends/arm/passes/tag_io_quant_pass.py b/backends/arm/passes/tag_io_quant_pass.py index d2bf74462ed..2fce6cf3fd4 100644 --- a/backends/arm/passes/tag_io_quant_pass.py +++ 
b/backends/arm/passes/tag_io_quant_pass.py @@ -4,6 +4,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +# pyre-unsafe + import torch from executorch.exir.dialects._ops import ops as exir_ops from executorch.exir.pass_base import ExportPass, PassResult diff --git a/backends/arm/quantizer/TARGETS b/backends/arm/quantizer/TARGETS new file mode 100644 index 00000000000..840586488bf --- /dev/null +++ b/backends/arm/quantizer/TARGETS @@ -0,0 +1,31 @@ +load("@fbcode_macros//build_defs:python_library.bzl", "python_library") + +python_library( + name = "arm_quantizer", + srcs = ["arm_quantizer.py"], + typing = True, + deps = [ + ":arm_quantizer_utils", + "//caffe2:torch", + "//executorch/backends/arm/quantizer/quantization_annotation:quantization_annotation", + "//executorch/exir:lib", + ], +) + +python_library( + name = "quantization_config", + srcs = ["quantization_config.py"], + typing = True, + deps = [ + "//caffe2:torch", + ], +) + +python_library( + name = "arm_quantizer_utils", + srcs = ["arm_quantizer_utils.py"], + typing = True, + deps = [ + ":quantization_config", + ], +) diff --git a/backends/arm/quantizer/arm_quantizer.py b/backends/arm/quantizer/arm_quantizer.py index 8d5edf386a0..853fd47c29c 100644 --- a/backends/arm/quantizer/arm_quantizer.py +++ b/backends/arm/quantizer/arm_quantizer.py @@ -5,6 +5,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +# pyre-unsafe + # # Quantizer for Arm backend # @@ -267,6 +269,8 @@ class ArmQuantizer(Quantizer): "mul", "sigmoid", "mm", + "cat", + "one_to_one", ] def __init__(self) -> None: diff --git a/backends/arm/quantizer/arm_quantizer_utils.py b/backends/arm/quantizer/arm_quantizer_utils.py index c5da32a40ad..fe9c5e34e6b 100644 --- a/backends/arm/quantizer/arm_quantizer_utils.py +++ b/backends/arm/quantizer/arm_quantizer_utils.py @@ -5,12 +5,14 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +# pyre-unsafe + # # Utility functions for ArmQuantizer # import operator -from typing import Callable, cast, List +from typing import Callable, cast, List, Union import torch from executorch.backends.arm.quantizer.quantization_config import QuantizationConfig @@ -72,7 +74,7 @@ def get_shared_qspec( Both outputs are None if one of the inputs is a node that can't be quantized. """ - input_act0 = node.args[0] + input_act0 = cast(Node, node.args[0]) input_act1 = node.args[1] input_act_qspec = quantization_config.get_input_act_qspec() @@ -102,12 +104,19 @@ def is_input_ok_for_quantization(input_act: Node, gm: GraphModule): ) +def get_node_target(module: torch.nn.Module | GraphModule, target_str: str): + targets = target_str.split(".") + for target in targets[:-1]: + module = module.get_submodule(target) + return getattr(module, targets[-1]) + + def is_input_large_scalar(node: Node, gm: GraphModule): """Check if input is a large scalar value. 
So that we can skip quantization for the node since histc op (in HistogramObserver) only works for values up to certain upper bound """ if node.op == "get_attr" and isinstance(node.target, str): - tensor = getattr(gm, node.target) + tensor = get_node_target(gm, node.target) # torch.histc works until this upper bound HISTC_UPPER_BOUND = 3.4028235e15 return tensor.numel() == 1 and abs(tensor.item()) > HISTC_UPPER_BOUND @@ -131,6 +140,7 @@ def is_share_obs_or_fq_op(op: Callable) -> bool: return op in [ torch.ops.aten.hardtanh.default, torch.ops.aten.hardtanh_.default, + torch.ops.aten.relu.default, torch.ops.aten.mean.default, torch.ops.aten.mean.dim, torch.ops.aten.permute.default, @@ -161,7 +171,9 @@ def propagate_annotation(model: GraphModule) -> None: n = cast(Node, n) if is_annotated(n): continue - if n.op != "call_function" or not is_share_obs_or_fq_op(n.target): + if n.op != "call_function" or not is_share_obs_or_fq_op( + cast(Callable, n.target) + ): continue prev_node = n.args[0] @@ -209,7 +221,7 @@ def convert_scalars_to_attrs(model: GraphModule) -> GraphModule: prefix = "_tensor_constant_" get_new_attr_name = get_new_attr_name_with_prefix(prefix) tensor_constant_name = get_new_attr_name(model) - float_tensor = torch.tensor(float(args[i])) + float_tensor = torch.tensor(float(cast(Union[int, float], args[i]))) model.register_buffer(tensor_constant_name, float_tensor) fake_mode = n.meta["val"].fake_mode with model.graph.inserting_before(n): diff --git a/backends/arm/quantizer/quantization_annotation/TARGETS b/backends/arm/quantizer/quantization_annotation/TARGETS new file mode 100644 index 00000000000..4ce8b5cad2c --- /dev/null +++ b/backends/arm/quantizer/quantization_annotation/TARGETS @@ -0,0 +1,12 @@ +load("@fbcode_macros//build_defs:python_library.bzl", "python_library") + +python_library( + name = "quantization_annotation", + srcs = glob(["*.py"]), + typing = True, + deps = [ + "//caffe2:torch", + "//executorch/backends/arm/quantizer:arm_quantizer_utils", + "//executorch/backends/arm/quantizer:quantization_config", + ], +) diff --git a/backends/arm/quantizer/quantization_annotation/__init__.py b/backends/arm/quantizer/quantization_annotation/__init__.py index 60808d2f234..f7219201dec 100644 --- a/backends/arm/quantizer/quantization_annotation/__init__.py +++ b/backends/arm/quantizer/quantization_annotation/__init__.py @@ -4,6 +4,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +# pyre-unsafe + from typing import Callable, Dict, List, NamedTuple, Optional @@ -49,11 +51,13 @@ def decorator(annotator: AnnotatorType): from . import ( # noqa adaptive_ang_pool2d_annotator, add_annotator, + cat_annotator, conv_annotator, linear_annotator, max_pool2d_annotator, mm_annotator, mul_annotator, + one_to_one_annotator, sigmoid_annotator, sub_annotator, ) diff --git a/backends/arm/quantizer/quantization_annotation/adaptive_ang_pool2d_annotator.py b/backends/arm/quantizer/quantization_annotation/adaptive_ang_pool2d_annotator.py index acbdc45b6b9..723a48f6644 100644 --- a/backends/arm/quantizer/quantization_annotation/adaptive_ang_pool2d_annotator.py +++ b/backends/arm/quantizer/quantization_annotation/adaptive_ang_pool2d_annotator.py @@ -4,6 +4,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
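The get_node_target helper added above exists because a get_attr node's target can be a dotted path into submodules, which a plain getattr on the root module cannot resolve. A self-contained sketch of the same lookup; the module and buffer names are made up for illustration.

import torch

def get_node_target(module: torch.nn.Module, target_str: str):
    # Walk every intermediate submodule, then getattr the final attribute.
    *path, leaf = target_str.split(".")
    for name in path:
        module = module.get_submodule(name)
    return getattr(module, leaf)

class Inner(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.register_buffer("scale", torch.tensor(3.0))

class Outer(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.inner = Inner()

print(get_node_target(Outer(), "inner.scale"))  # tensor(3.)
# getattr(Outer(), "inner.scale") would raise AttributeError instead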
+# pyre-unsafe + import itertools from typing import Callable, List, Optional diff --git a/backends/arm/quantizer/quantization_annotation/add_annotator.py b/backends/arm/quantizer/quantization_annotation/add_annotator.py index 2926e92f243..35801bd5681 100644 --- a/backends/arm/quantizer/quantization_annotation/add_annotator.py +++ b/backends/arm/quantizer/quantization_annotation/add_annotator.py @@ -4,6 +4,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +# pyre-unsafe + import itertools import operator from typing import Callable, List, Optional diff --git a/backends/arm/quantizer/quantization_annotation/cat_annotator.py b/backends/arm/quantizer/quantization_annotation/cat_annotator.py new file mode 100644 index 00000000000..6e138cd9def --- /dev/null +++ b/backends/arm/quantizer/quantization_annotation/cat_annotator.py @@ -0,0 +1,68 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# Copyright 2024 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# pyre-unsafe + +import itertools +from typing import Callable, cast, List, Optional + +import torch.fx +from executorch.backends.arm.quantizer import arm_quantizer_utils +from executorch.backends.arm.quantizer.quantization_annotation import register_annotator +from executorch.backends.arm.quantizer.quantization_config import QuantizationConfig +from torch.ao.quantization.quantizer import ( + QuantizationAnnotation, + SharedQuantizationSpec, +) +from torch.fx import Node +from torch.fx.passes.utils.source_matcher_utils import get_source_partitions + + +@register_annotator("cat") +def _annotate_cat( + gm: torch.fx.GraphModule, + quantization_config: QuantizationConfig, + filter_fn: Optional[Callable[[Node], bool]] = None, +) -> Optional[List[List[Node]]]: + cat_partitions = get_source_partitions(gm.graph, [torch.cat], filter_fn) + cat_partitions = list(itertools.chain.from_iterable(cat_partitions.values())) + annotated_partitions = [] + for cat_partition in cat_partitions: + annotated_partitions.append(cat_partition.nodes) + cat_node = cat_partition.output_nodes[0] + if arm_quantizer_utils.is_annotated(cat_node): + continue + + input_acts = cast(list[torch.fx.Node], cat_node.args[0]) + input_act0 = input_acts[0] + + input_act_qspec = quantization_config.get_input_act_qspec() + shared_with_input0_qspec = SharedQuantizationSpec((input_act0, cat_node)) + + input_qspec_map = {} + + # First input is set to input qspec from the quantization config. + if isinstance(input_act0, Node): + if not arm_quantizer_utils.is_input_ok_for_quantization(input_act0, gm): + continue + input_qspec_map[input_act0] = input_act_qspec + + # For the rest of the inputs, share qspec with first. + # If we can't quantize any of the inputs, abort annotation. 
+ for input_act in input_acts[1:]: + if isinstance(input_act, Node): + if not arm_quantizer_utils.is_input_ok_for_quantization(input_act, gm): + continue + if input_act is not input_act0: + input_qspec_map[input_act] = shared_with_input0_qspec + + if input_qspec_map is not None: + cat_node.meta["quantization_annotation"] = QuantizationAnnotation( + input_qspec_map=input_qspec_map, + output_qspec=shared_with_input0_qspec, + _annotated=True, + ) + return annotated_partitions diff --git a/backends/arm/quantizer/quantization_annotation/conv_annotator.py b/backends/arm/quantizer/quantization_annotation/conv_annotator.py index 40a1f1ee9ea..4ff7dd9e800 100644 --- a/backends/arm/quantizer/quantization_annotation/conv_annotator.py +++ b/backends/arm/quantizer/quantization_annotation/conv_annotator.py @@ -4,6 +4,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree.f +# pyre-unsafe + from typing import Callable, List, Optional import torch diff --git a/backends/arm/quantizer/quantization_annotation/linear_annotator.py b/backends/arm/quantizer/quantization_annotation/linear_annotator.py index 95b881a9548..7c3f91ec707 100644 --- a/backends/arm/quantizer/quantization_annotation/linear_annotator.py +++ b/backends/arm/quantizer/quantization_annotation/linear_annotator.py @@ -4,6 +4,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +# pyre-unsafe + from typing import Callable, List, Optional import torch diff --git a/backends/arm/quantizer/quantization_annotation/max_pool2d_annotator.py b/backends/arm/quantizer/quantization_annotation/max_pool2d_annotator.py index 3d9d8b2e6c8..0ef2ee39fe5 100644 --- a/backends/arm/quantizer/quantization_annotation/max_pool2d_annotator.py +++ b/backends/arm/quantizer/quantization_annotation/max_pool2d_annotator.py @@ -4,6 +4,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +# pyre-unsafe + import itertools from typing import Callable, List, Optional diff --git a/backends/arm/quantizer/quantization_annotation/mm_annotator.py b/backends/arm/quantizer/quantization_annotation/mm_annotator.py index 969f0131ffd..b48c6d59905 100644 --- a/backends/arm/quantizer/quantization_annotation/mm_annotator.py +++ b/backends/arm/quantizer/quantization_annotation/mm_annotator.py @@ -4,6 +4,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
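Why _annotate_cat above shares the first input's quantization spec with the remaining inputs and the output: when every operand of a cat uses the same scale and zero-point, concatenating the raw int8 buffers already yields the quantized result, so no requantization is needed around the cat. A rough illustration with made-up quantization parameters.

import torch

scale, zero_point = 0.05, 0
a, b = torch.rand(2, 3), torch.rand(2, 3)

def quantize(t):
    return torch.clamp((t / scale).round() + zero_point, -128, 127).to(torch.int8)

q_cat = torch.cat([quantize(a), quantize(b)])          # cat runs directly on the int8 buffers
dequantized = (q_cat.to(torch.float32) - zero_point) * scale
print(torch.allclose(dequantized, torch.cat([a, b]), atol=scale / 2))  # True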
+# pyre-unsafe + import itertools from typing import Callable, List, Optional @@ -22,7 +24,7 @@ def _annotate_mm( quantization_config: QuantizationConfig, filter_fn: Optional[Callable[[Node], bool]] = None, ) -> Optional[List[List[Node]]]: - mm_partitions = get_source_partitions(gm.graph, [torch.mm], filter_fn) + mm_partitions = get_source_partitions(gm.graph, [torch.mm, torch.bmm], filter_fn) mm_partitions = list(itertools.chain.from_iterable(mm_partitions.values())) annotated_partitions = [] for mm_partition in mm_partitions: diff --git a/backends/arm/quantizer/quantization_annotation/mul_annotator.py b/backends/arm/quantizer/quantization_annotation/mul_annotator.py index 6ec8f95531b..4717eac320d 100644 --- a/backends/arm/quantizer/quantization_annotation/mul_annotator.py +++ b/backends/arm/quantizer/quantization_annotation/mul_annotator.py @@ -4,6 +4,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +# pyre-unsafe + import itertools import operator from typing import Callable, List, Optional diff --git a/backends/arm/quantizer/quantization_annotation/one_to_one_annotator.py b/backends/arm/quantizer/quantization_annotation/one_to_one_annotator.py new file mode 100644 index 00000000000..8d507c11ef3 --- /dev/null +++ b/backends/arm/quantizer/quantization_annotation/one_to_one_annotator.py @@ -0,0 +1,57 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# Copyright 2024 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# pyre-unsafe + +from typing import Callable, List, Optional + +import torch +import torch.fx +from executorch.backends.arm.quantizer import arm_quantizer_utils +from executorch.backends.arm.quantizer.quantization_annotation import register_annotator +from executorch.backends.arm.quantizer.quantization_config import QuantizationConfig +from torch.ao.quantization.quantizer.utils import ( + _annotate_input_qspec_map, + _annotate_output_qspec, +) +from torch.fx import Node + + +@register_annotator("one_to_one") +def _annotate_one_to_one( + gm: torch.fx.GraphModule, + quantization_config: QuantizationConfig, + filter_fn: Optional[Callable[[Node], bool]] = None, +) -> Optional[List[List[Node]]]: + """ + This annotator adds the input and output qspec from the quantization config to + ops in 'one_to_one_ops' that have the following properties: + - Have a single input and single output. + - Can handle different qspecs on the input and output. + + Typical ops are ops implemented with a lookup table. 
+ """ + annotated_partitions = [] + one_to_one_ops = (torch.ops.aten.exp.default, torch.ops.aten.log.default) + for node in gm.graph.nodes: + if node.op != "call_function" or node.target not in one_to_one_ops: + continue + if filter_fn and not filter_fn(node): + continue + input_node = node.args[0] + + if not arm_quantizer_utils.is_annotated(node): + _annotate_input_qspec_map( + node, + input_node, + quantization_config.get_input_act_qspec(), + ) + _annotate_output_qspec(node, quantization_config.get_output_act_qspec()) + + arm_quantizer_utils.mark_nodes_as_annotated([node]) + annotated_partitions.append([node]) + + return annotated_partitions diff --git a/backends/arm/quantizer/quantization_annotation/sigmoid_annotator.py b/backends/arm/quantizer/quantization_annotation/sigmoid_annotator.py index bd683d81f0b..3d242694836 100644 --- a/backends/arm/quantizer/quantization_annotation/sigmoid_annotator.py +++ b/backends/arm/quantizer/quantization_annotation/sigmoid_annotator.py @@ -4,6 +4,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +# pyre-unsafe + from typing import Callable, List, Optional import torch diff --git a/backends/arm/quantizer/quantization_annotation/sub_annotator.py b/backends/arm/quantizer/quantization_annotation/sub_annotator.py index 4686d480edb..92f1808d023 100644 --- a/backends/arm/quantizer/quantization_annotation/sub_annotator.py +++ b/backends/arm/quantizer/quantization_annotation/sub_annotator.py @@ -4,6 +4,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +# pyre-unsafe + import itertools import operator from typing import Callable, List, Optional diff --git a/backends/arm/quantizer/quantization_config.py b/backends/arm/quantizer/quantization_config.py index f94c3e18da6..1e776d37a6f 100644 --- a/backends/arm/quantizer/quantization_config.py +++ b/backends/arm/quantizer/quantization_config.py @@ -4,6 +4,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
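The one_to_one annotator above deliberately allows different input and output qspecs because single-input elementwise ops such as exp and log are typically lowered to a 256-entry lookup table: each int8 input code maps directly to an int8 output code, so nothing forces both sides to share parameters. A sketch with made-up scales and zero-points.

import torch

in_scale, in_zp = 0.1, 0
out_scale, out_zp = 0.05, -128

# Build the table once: dequantize every possible input code, apply exp, requantize.
codes = torch.arange(-128, 128, dtype=torch.float32)
real = torch.exp((codes - in_zp) * in_scale)
lut = torch.clamp((real / out_scale).round() + out_zp, -128, 127).to(torch.int8)

x_q = torch.tensor([-10, 0, 15])   # quantized inputs
print(lut[x_q + 128])              # quantized exp outputs, no float math at runtime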
+# pyre-unsafe + from dataclasses import dataclass import torch diff --git a/backends/arm/runtime/ArmBackendEthosU.cpp b/backends/arm/runtime/ArmBackendEthosU.cpp index 7420874d8f4..6d9ab6b0091 100644 --- a/backends/arm/runtime/ArmBackendEthosU.cpp +++ b/backends/arm/runtime/ArmBackendEthosU.cpp @@ -11,9 +11,9 @@ */ #include +#include #include -#include #include "executorch/backends/arm/runtime/VelaBinStream.h" #include "executorch/runtime/backend/interface.h" @@ -31,7 +31,22 @@ typedef struct { bool permuted_io_flag; } ExecutionHandle; -class ArmBackend final : public PyTorchBackendInterface { +extern "C" { +void __attribute__((weak)) ArmBackend_execute_begin() {} +void __attribute__((weak)) ArmBackend_execute_end() {} +} + +class ArmBackendExecuteCallbacks { + public: + ArmBackendExecuteCallbacks() { + ArmBackend_execute_begin(); + } + ~ArmBackendExecuteCallbacks() { + ArmBackend_execute_end(); + } +}; + +class ArmBackend final : public ::executorch::runtime::BackendInterface { public: ArmBackend() {} @@ -82,6 +97,7 @@ class ArmBackend final : public PyTorchBackendInterface { ExecutionHandle* execution_handle = (ExecutionHandle*)input_handle; VelaHandles handles; + ArmBackendExecuteCallbacks ArmBackend_execute_callbacks; // Command stream - we know at this point it's aligned char* data = (char*)execution_handle->processed->data(); ET_LOG(Info, "ArmBackend::execute %p", data); @@ -147,8 +163,9 @@ class ArmBackend final : public PyTorchBackendInterface { if (both_char and permuted_input_shape) { // permuted byte copy CHW to HWC permute_CHW_to_HWC( - scratch_addr, tensor_in.mutable_data_ptr(), + scratch_addr, + tensor_in.size(1), tensor_in.size(2), tensor_in.size(3)); } else if (both_char or both_int) { @@ -164,8 +181,10 @@ class ArmBackend final : public PyTorchBackendInterface { } // Allocate driver handle and synchronously invoke driver - ethosu_driver* drv = ethosu_reserve_driver(); - if (drv == NULL) { + auto driver = + std::unique_ptr( + ethosu_reserve_driver(), ethosu_release_driver); + if (driver == NULL) { ET_LOG(Error, "ArmBackend::execute: ethosu_reserve_driver failed"); return Error::InvalidState; } @@ -178,7 +197,7 @@ class ArmBackend final : public PyTorchBackendInterface { size_t bases_size[2] = { handles.weight_data_size, handles.scratch_data_size}; int result = ethosu_invoke_v3( - drv, + driver.get(), (void*)handles.cmd_data, handles.cmd_data_size, bases, @@ -201,17 +220,34 @@ class ArmBackend final : public PyTorchBackendInterface { // Process input EValue into scratch // Outputs are in the index immediately after inputs auto tensor_out = args[handles.inputs->count + i]->toTensor(); - for (int j = 0; j < tensor_out.numel(); j++) { - if (tensor_out.scalar_type() == ScalarType::Char) { - char* output_address = (char*)output_addr; - tensor_out.mutable_data_ptr()[j] = output_address[j]; - } else { - int* output_address = (int*)output_addr; - tensor_out.mutable_data_ptr()[j] = output_address[j]; + bool permuted_output_shape; + ET_CHECK_OK_OR_RETURN_ERROR(check_requires_permute( + i, + tensor_out, + &handles.outputs->io[i], + execution_handle->permuted_io_flag, + &permuted_output_shape)); + if (tensor_out.scalar_type() == ScalarType::Char and + permuted_output_shape) { + char* output_address = (char*)output_addr; + permute_HWC_to_CHW( + output_address, + tensor_out.mutable_data_ptr(), + tensor_out.size(1), + tensor_out.size(2), + tensor_out.size(3)); + } else { + for (int j = 0; j < tensor_out.numel(); j++) { + if (tensor_out.scalar_type() == ScalarType::Char) { + char* 
output_address = (char*)output_addr; + tensor_out.mutable_data_ptr()[j] = output_address[j]; + } else { + int* output_address = (int*)output_addr; + tensor_out.mutable_data_ptr()[j] = output_address[j]; + } } } } - return Error::Ok; } @@ -222,51 +258,71 @@ class ArmBackend final : public PyTorchBackendInterface { private: Error check_requires_permute( int index, - const exec_aten::Tensor tensor_in, - VelaIO* input, + const exec_aten::Tensor tensor, + VelaIO* io, bool permuted_io_flag, bool* is_permuted) const { - bool permuted_input_shape = false; - if (tensor_in.dim() == 4) { + bool permuted_shape = false; + if (tensor.dim() == 4) { // special case for NHWC workaround in AOT; as the compilation has // permuted to channel last in an undetectable way, we assume here - // that the application has similarly permuted any input tensors. - permuted_input_shape = tensor_in.size(0) == input->shape[0] && - tensor_in.size(1) == input->shape[3] && - tensor_in.size(2) == input->shape[1] && - tensor_in.size(3) == input->shape[2]; - if (permuted_input_shape) { - ET_LOG(Info, "Tensor input %d will be permuted", index); + // that the application has similarly permuted any input/output tensors. + permuted_shape = tensor.size(0) == io->shape[0] && + tensor.size(1) == io->shape[3] && tensor.size(2) == io->shape[1] && + tensor.size(3) == io->shape[2]; + if (permuted_shape) { + ET_LOG(Info, "Tensor input/output %d will be permuted", index); } - if (permuted_io_flag != permuted_input_shape) { - ET_LOG(Error, "Permute compile flag and permuted input don't agree"); + if (permuted_io_flag != permuted_shape) { + ET_LOG( + Error, + "Permute compile flag and permuted input/output don't agree"); return Error::InvalidProgram; } } - if (!permuted_input_shape) { - // Error check matching shapes in the general case - for (int i = 0; i < tensor_in.dim(); i++) { - if (tensor_in.size(i) != input->shape[i]) { - ET_LOG(Error, "Tensor input %d mismatched shape", index); - ET_LOG( - Error, - "dimension %d mismatch, %zd != %d", - index, - tensor_in.size(i), - input->shape[i]); - return Error::InvalidProgram; - } + if (!permuted_shape) { + // Check the number of elements in each tensor match + int tensor_count = 1; + int io_count = 1; + + for (int i = 0; i < tensor.dim(); i++) { + tensor_count = tensor_count * tensor.size(i); + } + + // The VelaIO type has a shape of fixed size 4 + for (int i = 0; i < 4; i++) { + io_count = io_count * io->shape[i]; + } + + if (tensor_count != io_count) { + ET_LOG(Error, "Input tensor sizes do not match"); + ET_LOG( + Error, + "Program expects %d elements but got %d", + io_count, + tensor_count); + return Error::InvalidProgram; } } - *is_permuted = permuted_input_shape; + *is_permuted = permuted_shape; return Error::Ok; } - void permute_CHW_to_HWC(char* input, char* output, int H, int W) const { + void permute_CHW_to_HWC(char* input, char* output, int C, int H, int W) + const { for (int i = 0; i != H * W; ++i) { - output[i * 3 + 0] = input[i + 0 * W * H]; - output[i * 3 + 1] = input[i + 1 * W * H]; - output[i * 3 + 2] = input[i + 2 * W * H]; + for (int j = 0; j < C; ++j) { + output[i * C + j] = input[i + j * W * H]; + } + } + } + + void permute_HWC_to_CHW(char* input, char* output, int C, int H, int W) + const { + for (int i = 0; i != H * W; ++i) { + for (int j = 0; j < C; ++j) { + output[i + j * W * H] = input[i * C + j]; + } } } }; diff --git a/backends/arm/test/common.py b/backends/arm/test/common.py index f85fd1f2dac..0d50f1882da 100644 --- a/backends/arm/test/common.py +++ 
b/backends/arm/test/common.py @@ -14,6 +14,7 @@ import torch from executorch.backends.arm.arm_backend import ArmCompileSpecBuilder +from executorch.exir.backend.compile_spec_schema import CompileSpec _enabled_options: list[str] = [] @@ -85,7 +86,9 @@ def is_option_enabled(option: str, fail_if_not_enabled: bool = False) -> bool: return False -def get_tosa_compile_spec(permute_memory_to_nhwc=True, custom_path=None): +def get_tosa_compile_spec( + permute_memory_to_nhwc=True, custom_path=None +) -> list[CompileSpec]: """ Default compile spec for TOSA tests. """ @@ -112,8 +115,8 @@ def get_tosa_compile_spec_unbuilt( def get_u55_compile_spec( - permute_memory_to_nhwc=False, quantize_io=False, custom_path=None -): + permute_memory_to_nhwc=True, quantize_io=False, custom_path=None +) -> list[CompileSpec]: """ Default compile spec for Ethos-U55 tests. """ @@ -122,10 +125,21 @@ def get_u55_compile_spec( ).build() +def get_u85_compile_spec( + permute_memory_to_nhwc=True, quantize_io=False, custom_path=None +) -> list[CompileSpec]: + """ + Default compile spec for Ethos-U85 tests. + """ + return get_u85_compile_spec_unbuilt( + permute_memory_to_nhwc, quantize_io=quantize_io, custom_path=custom_path + ).build() + + def get_u55_compile_spec_unbuilt( - permute_memory_to_nhwc=False, quantize_io=False, custom_path=None + permute_memory_to_nhwc=True, quantize_io=False, custom_path=None ) -> ArmCompileSpecBuilder: - """Get the ArmCompileSpecBuilder for the default TOSA tests, to modify + """Get the ArmCompileSpecBuilder for the Ethos-U55 tests, to modify the compile spec before calling .build() to finalize it. """ artifact_path = custom_path or tempfile.mkdtemp(prefix="arm_u55_") @@ -137,7 +151,29 @@ def get_u55_compile_spec_unbuilt( "ethos-u55-128", system_config="Ethos_U55_High_End_Embedded", memory_mode="Shared_Sram", - extra_flags=None, + extra_flags="--debug-force-regor --output-format=raw", + ) + .set_quantize_io(is_option_enabled("quantize_io") or quantize_io) + .set_permute_memory_format(permute_memory_to_nhwc) + .dump_intermediate_artifacts_to(artifact_path) + ) + return compile_spec + + +def get_u85_compile_spec_unbuilt( + permute_memory_to_nhwc=True, quantize_io=False, custom_path=None +) -> list[CompileSpec]: + """Get the ArmCompileSpecBuilder for the Ethos-U85 tests, to modify + the compile spec before calling .build() to finalize it. + """ + artifact_path = custom_path or tempfile.mkdtemp(prefix="arm_u85_") + compile_spec = ( + ArmCompileSpecBuilder() + .ethosu_compile_spec( + "ethos-u85-128", + system_config="Ethos_U85_SYS_DRAM_Mid", + memory_mode="Shared_Sram", + extra_flags="--output-format=raw", ) .set_quantize_io(is_option_enabled("quantize_io") or quantize_io) .set_permute_memory_format(permute_memory_to_nhwc) diff --git a/backends/arm/test/misc/test_lifted_tensor.py b/backends/arm/test/misc/test_lifted_tensor.py new file mode 100644 index 00000000000..90aa7e2950c --- /dev/null +++ b/backends/arm/test/misc/test_lifted_tensor.py @@ -0,0 +1,42 @@ +# Copyright 2024 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
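The test added below exercises placeholders of type lifted tensor constant: when torch.export encounters a plain tensor attribute (neither a parameter nor a registered buffer), it lifts it into the graph signature rather than leaving it as a get_attr. A rough sketch of where such constants surface; the exact API surface may differ between torch versions.

import torch
from torch.export import export

class M(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.offset = torch.tensor([[1.0, 2.0], [3.0, 4.0]])  # plain attribute

    def forward(self, x):
        return x + self.offset

ep = export(M(), (torch.ones(2, 2),))
print(ep.graph_signature.lifted_tensor_constants)  # the lifted attribute shows up here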
+ +import unittest + +import torch +from executorch.backends.arm.test import common +from executorch.backends.arm.test.tester.arm_tester import ArmTester + + +class LiftedTensor(torch.nn.Module): + + def __init__(self): + super().__init__() + self.lifted_tensor = torch.Tensor([[1, 2], [3, 4]]) + + def forward(self, x: torch.Tensor, length) -> torch.Tensor: + sliced = self.lifted_tensor[:, :length] + return sliced + x + + +class TestLiftedTensor(unittest.TestCase): + """Tests the ArmPartitioner with a placeholder of type lifted tensor.""" + + def test_partition_lifted_tensor(self): + tester = ( + ArmTester( + LiftedTensor(), + example_inputs=(torch.ones(2, 2), 2), + compile_spec=common.get_tosa_compile_spec(), + ) + .export() + .to_edge() + .dump_artifact() + ) + signature = tester.get_artifact().exported_program().graph_signature + assert len(signature.lifted_tensor_constants) > 0 + tester.partition() + tester.to_executorch() + tester.run_method_and_compare_outputs((torch.ones(2, 2), 2)) diff --git a/backends/arm/test/models/test_mobilenet_v2_arm.py b/backends/arm/test/models/test_mobilenet_v2_arm.py index 248153a5180..f9d408c1bae 100644 --- a/backends/arm/test/models/test_mobilenet_v2_arm.py +++ b/backends/arm/test/models/test_mobilenet_v2_arm.py @@ -84,7 +84,7 @@ def test_mv2_tosa_BI(self): ) def test_mv2_u55_BI(self): - ( + tester = ( ArmTester( self.mv2, example_inputs=self.model_inputs, @@ -96,4 +96,24 @@ def test_mv2_u55_BI(self): .check(list(self.operators_after_quantization)) .partition() .to_executorch() + .serialize() + ) + if common.is_option_enabled("corstone300"): + tester.run_method_and_compare_outputs( + atol=1.0, qtol=1, inputs=self.model_inputs + ) + + def test_mv2_u85_BI(self): + ( + ArmTester( + self.mv2, + example_inputs=self.model_inputs, + compile_spec=common.get_u85_compile_spec(permute_memory_to_nhwc=True), + ) + .quantize() + .export() + .to_edge(config=self._edge_compile_config) + .check(list(self.operators_after_quantization)) + .partition() + .to_executorch() ) diff --git a/backends/arm/test/ops/test_add.py b/backends/arm/test/ops/test_add.py index 3bd2b2605c4..cff8af11654 100644 --- a/backends/arm/test/ops/test_add.py +++ b/backends/arm/test/ops/test_add.py @@ -13,6 +13,7 @@ from executorch.backends.arm.test import common from executorch.backends.arm.test.tester.arm_tester import ArmTester from executorch.exir import EdgeCompileConfig +from executorch.exir.backend.compile_spec_schema import CompileSpec from parameterized import parameterized @@ -37,9 +38,9 @@ class Add2(torch.nn.Module): torch.FloatTensor([1, 2, 3, 5, 7]), (torch.FloatTensor([2, 1, 2, 1, 10])), ), - (torch.ones(1, 1, 4, 4), torch.ones(1, 1, 4, 4)), + (torch.ones(1, 10, 4, 6), torch.ones(1, 10, 4, 6)), (torch.randn(1, 1, 4, 4), torch.ones(1, 1, 4, 1)), - (torch.randn(1, 1, 4, 4), torch.randn(1, 1, 4, 1)), + (torch.randn(1, 3, 4, 4), torch.randn(1, 3, 4, 4)), (10000 * torch.randn(1, 1, 4, 4), torch.randn(1, 1, 4, 1)), ] @@ -92,16 +93,17 @@ def _test_add_tosa_BI_pipeline( .run_method_and_compare_outputs(inputs=test_data, qtol=1) ) - def _test_add_u55_BI_pipeline( + def _test_add_ethos_BI_pipeline( self, module: torch.nn.Module, + compile_spec: CompileSpec, test_data: Tuple[torch.Tensor], ): tester = ( ArmTester( module, example_inputs=test_data, - compile_spec=common.get_u55_compile_spec(), + compile_spec=compile_spec, ) .quantize() .export() @@ -114,8 +116,7 @@ def _test_add_u55_BI_pipeline( .serialize() ) - if common.is_option_enabled("corstone300"): - 
tester.run_method_and_compare_outputs(qtol=1, inputs=test_data) + return tester @parameterized.expand(Add.test_parameters) def test_add_tosa_MI(self, test_data: torch.Tensor): @@ -130,7 +131,22 @@ def test_add_tosa_BI(self, test_data: torch.Tensor): @parameterized.expand(Add.test_parameters) def test_add_u55_BI(self, test_data: torch.Tensor): test_data = (test_data,) - self._test_add_u55_BI_pipeline(self.Add(), test_data) + tester = self._test_add_ethos_BI_pipeline( + self.Add(), + common.get_u55_compile_spec(permute_memory_to_nhwc=True), + test_data, + ) + if common.is_option_enabled("corstone300"): + tester.run_method_and_compare_outputs(qtol=1, inputs=test_data) + + @parameterized.expand(Add.test_parameters) + def test_add_u85_BI(self, test_data: torch.Tensor): + test_data = (test_data,) + self._test_add_ethos_BI_pipeline( + self.Add(), + common.get_u85_compile_spec(permute_memory_to_nhwc=True), + test_data, + ) @parameterized.expand(Add2.test_parameters) def test_add2_tosa_MI(self, operand1: torch.Tensor, operand2: torch.Tensor): @@ -145,4 +161,15 @@ def test_add2_tosa_BI(self, operand1: torch.Tensor, operand2: torch.Tensor): @parameterized.expand(Add2.test_parameters) def test_add2_u55_BI(self, operand1: torch.Tensor, operand2: torch.Tensor): test_data = (operand1, operand2) - self._test_add_u55_BI_pipeline(self.Add2(), test_data) + tester = self._test_add_ethos_BI_pipeline( + self.Add2(), common.get_u55_compile_spec(), test_data + ) + if common.is_option_enabled("corstone300"): + tester.run_method_and_compare_outputs(qtol=1, inputs=test_data) + + @parameterized.expand(Add2.test_parameters) + def test_add2_u85_BI(self, operand1: torch.Tensor, operand2: torch.Tensor): + test_data = (operand1, operand2) + self._test_add_ethos_BI_pipeline( + self.Add2(), common.get_u85_compile_spec(), test_data + ) diff --git a/backends/arm/test/ops/test_avg_pool.py b/backends/arm/test/ops/test_avg_pool.py index 32a0e5555a3..6c14420dbcf 100644 --- a/backends/arm/test/ops/test_avg_pool.py +++ b/backends/arm/test/ops/test_avg_pool.py @@ -13,6 +13,7 @@ import torch from executorch.backends.arm.test import common from executorch.backends.arm.test.tester.arm_tester import ArmTester +from executorch.exir.backend.backend_details import CompileSpec from parameterized import parameterized logger = logging.getLogger(__name__) @@ -86,14 +87,17 @@ def _test_avgpool2d_tosa_BI_pipeline( .run_method_and_compare_outputs(inputs=test_data, qtol=1) ) - def _test_avgpool2d_tosa_u55_BI_pipeline( - self, module: torch.nn.Module, test_data: Tuple[torch.tensor] + def _test_avgpool2d_tosa_ethos_BI_pipeline( + self, + module: torch.nn.Module, + compile_spec: CompileSpec, + test_data: Tuple[torch.tensor], ): ( ArmTester( module, example_inputs=test_data, - compile_spec=common.get_u55_compile_spec(permute_memory_to_nhwc=True), + compile_spec=compile_spec, ) .quantize() .export() @@ -141,6 +145,22 @@ def test_avgpool2d_tosa_u55_BI( test_data: torch.Tensor, model_params: int | Tuple[int, int], ): - self._test_avgpool2d_tosa_u55_BI_pipeline( - self.AvgPool2d(*model_params), (test_data,) + self._test_avgpool2d_tosa_ethos_BI_pipeline( + self.AvgPool2d(*model_params), + common.get_u55_compile_spec(permute_memory_to_nhwc=True), + (test_data,), + ) + + @parameterized.expand(test_data_suite) + @unittest.expectedFailure + def test_avgpool2d_tosa_u85_BI( + self, + test_name: str, + test_data: torch.Tensor, + model_params: int | Tuple[int, int], + ): + self._test_avgpool2d_tosa_ethos_BI_pipeline( + self.AvgPool2d(*model_params), + 
common.get_u85_compile_spec(permute_memory_to_nhwc=True), + (test_data,), ) diff --git a/backends/arm/test/ops/test_bmm.py b/backends/arm/test/ops/test_bmm.py new file mode 100644 index 00000000000..e4e6abb7bb3 --- /dev/null +++ b/backends/arm/test/ops/test_bmm.py @@ -0,0 +1,148 @@ +# Copyright 2024 Arm Limited and/or its affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import unittest + +from typing import Tuple + +import torch +from executorch.backends.arm.test import common +from executorch.backends.arm.test.tester.arm_tester import ArmTester +from executorch.exir.backend.compile_spec_schema import CompileSpec +from parameterized import parameterized + +torch.manual_seed(1) + + +class TestBMM(unittest.TestCase): + """Tests Batch MatMul""" + + class BMM(torch.nn.Module): + test_parameters = [ + (torch.rand(5, 3, 5), torch.rand(5, 5, 2)), + (torch.rand(2, 1, 1), torch.rand(2, 1, 1)), + (torch.ones(1, 55, 3), torch.ones(1, 3, 44)), + (10000 * torch.randn(10, 1, 10), torch.randn(10, 10, 5)), + (-10 * torch.randn(2, 32, 64), 5 + 5 * torch.randn(2, 64, 32)), + ] + + def forward(self, x, y): + return torch.bmm(x, y) + + class BMMSingleInput(torch.nn.Module): + test_parameters = [ + (torch.rand(20, 3, 3),), + (torch.ones(2, 128, 128),), + (10000 * torch.randn(4, 25, 25),), + (5 + 5 * torch.randn(3, 64, 64),), + ] + + def forward(self, x): + return torch.bmm(x, x) + + def _test_bmm_tosa_MI_pipeline( + self, module: torch.nn.Module, test_data: Tuple[torch.Tensor, ...] + ): + ( + ArmTester( + module, + example_inputs=test_data, + compile_spec=common.get_tosa_compile_spec(), + ) + .export() + .check_count({"torch.ops.aten.bmm.default": 1}) + .check_not(["torch.ops.quantized_decomposed"]) + .to_edge() + .partition() + .check_not(["executorch_exir_dialects_edge__ops_aten_bmm_default"]) + .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) + .to_executorch() + .run_method_and_compare_outputs(inputs=test_data) + ) + + def _test_bmm_tosa_BI_pipeline( + self, module: torch.nn.Module, test_data: Tuple[torch.Tensor, ...] 
+ ): + ( + ArmTester( + module, + example_inputs=test_data, + compile_spec=common.get_tosa_compile_spec(), + ) + .quantize() + .export() + .check_count({"torch.ops.aten.bmm.default": 1}) + .check(["torch.ops.quantized_decomposed"]) + .to_edge() + .partition() + .check_not(["executorch_exir_dialects_edge__ops_aten_bmm_default"]) + .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) + .to_executorch() + .run_method_and_compare_outputs(inputs=test_data) + ) + + def _test_bmm_ethosu_BI_pipeline( + self, + module: torch.nn.Module, + compile_spec: CompileSpec, + test_data: Tuple[torch.Tensor, ...], + ): + ( + ArmTester( + module, + example_inputs=test_data, + compile_spec=compile_spec, + ) + .quantize() + .export() + .check_count({"torch.ops.aten.bmm.default": 1}) + .check(["torch.ops.quantized_decomposed"]) + .to_edge() + .partition() + .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) + .to_executorch() + ) + + @parameterized.expand(BMM.test_parameters) + def test_bmm_tosa_MI(self, operand1: torch.Tensor, operand2: torch.Tensor): + test_data = (operand1, operand2) + self._test_bmm_tosa_MI_pipeline(self.BMM(), test_data) + + @parameterized.expand(BMMSingleInput.test_parameters) + def test_bmm_single_input_tosa_MI(self, operand1: torch.Tensor): + test_data = (operand1,) + self._test_bmm_tosa_MI_pipeline(self.BMMSingleInput(), test_data) + + @parameterized.expand(BMM.test_parameters) + def test_bmm_tosa_BI(self, operand1: torch.Tensor, operand2: torch.Tensor): + test_data = (operand1, operand2) + self._test_bmm_tosa_BI_pipeline(self.BMM(), test_data) + + @parameterized.expand(BMMSingleInput.test_parameters) + def test_bmm_single_input_tosa_BI(self, operand1: torch.Tensor): + test_data = (operand1,) + self._test_bmm_tosa_BI_pipeline(self.BMMSingleInput(), test_data) + + @parameterized.expand(BMM.test_parameters) + def test_bmm_u55_BI(self, operand1: torch.Tensor, operand2: torch.Tensor): + test_data = (operand1, operand2) + self._test_bmm_tosa_BI_pipeline(self.BMM(), test_data) + + # Expected to fail with error: Warning, unsupported fusing of TOSA Rescale previous operator is of type: Memcpy + @parameterized.expand(BMMSingleInput.test_parameters) + @unittest.expectedFailure + def test_bmm_single_input_u55_BI(self, operand1: torch.Tensor): + test_data = (operand1,) + self._test_bmm_ethosu_BI_pipeline( + self.BMMSingleInput(), common.get_u55_compile_spec(), test_data + ) + + @parameterized.expand(BMMSingleInput.test_parameters) + def test_bmm_single_input_u85_BI(self, operand1: torch.Tensor): + test_data = (operand1,) + self._test_bmm_ethosu_BI_pipeline( + self.BMMSingleInput(), common.get_u85_compile_spec(), test_data + ) diff --git a/backends/arm/test/ops/test_cat.py b/backends/arm/test/ops/test_cat.py new file mode 100644 index 00000000000..9723ba0f0c0 --- /dev/null +++ b/backends/arm/test/ops/test_cat.py @@ -0,0 +1,144 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# Copyright 2024 Arm Limited and/or its affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
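For reference on the new BMM tests above: torch.bmm multiplies batched matrices of shapes (B, N, M) and (B, M, P) into (B, N, P), and the single-input variant bmm(x, x) therefore only type-checks when each batch matrix is square. A quick shape check matching the first test parameters.

import torch

x, y = torch.rand(5, 3, 5), torch.rand(5, 5, 2)
print(torch.bmm(x, y).shape)            # torch.Size([5, 3, 2])

square = torch.rand(20, 3, 3)
print(torch.bmm(square, square).shape)  # torch.Size([20, 3, 3])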
+ +import unittest + +from typing import Tuple + +import torch +from executorch.backends.arm.test import common + +from executorch.backends.arm.test.tester.arm_tester import ArmTester +from executorch.exir.backend.compile_spec_schema import CompileSpec +from parameterized import parameterized + + +class TestCat(unittest.TestCase): + + class Cat(torch.nn.Module): + test_parameters = [ + ((torch.ones(1), torch.ones(1)), 0), + ((torch.ones(1, 2), torch.randn(1, 5), torch.randn(1, 1)), 1), + ( + ( + torch.ones(1, 2, 5), + torch.randn(1, 2, 4), + torch.randn(1, 2, 2), + torch.randn(1, 2, 1), + ), + -1, + ), + ((torch.randn(2, 2, 4, 4), torch.randn(2, 2, 4, 1)), 3), + ( + ( + 10000 * torch.randn(2, 3, 1, 4), + torch.randn(2, 7, 1, 4), + torch.randn(2, 1, 1, 4), + ), + -3, + ), + ] + + def __init__(self): + super().__init__() + + def forward(self, tensors: tuple[torch.Tensor, ...], dim: int) -> torch.Tensor: + return torch.cat(tensors, dim=dim) + + def _test_cat_tosa_MI_pipeline( + self, module: torch.nn.Module, test_data: Tuple[tuple[torch.Tensor, ...], int] + ): + ( + ArmTester( + module, + example_inputs=test_data, + compile_spec=common.get_tosa_compile_spec(), + ) + .export() + .check_count({"torch.ops.aten.cat.default": 1}) + .check_not(["torch.ops.quantized_decomposed"]) + .to_edge() + .partition() + .check_not(["executorch_exir_dialects_edge__ops_aten_cat_default"]) + .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) + .to_executorch() + .run_method_and_compare_outputs(inputs=test_data) + ) + + def _test_cat_tosa_BI_pipeline( + self, module: torch.nn.Module, test_data: Tuple[tuple[torch.Tensor, ...], int] + ): + ( + ArmTester( + module, + example_inputs=test_data, + compile_spec=common.get_tosa_compile_spec(), + ) + .quantize() + .export() + .check_count({"torch.ops.aten.cat.default": 1}) + .check(["torch.ops.quantized_decomposed"]) + .to_edge() + .partition() + .check_not(["executorch_exir_dialects_edge__ops_aten_cat_default"]) + .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) + .to_executorch() + .run_method_and_compare_outputs(inputs=test_data, qtol=1) + ) + + def _test_cat_ethosu_BI_pipeline( + self, + module: torch.nn.Module, + compile_spec: CompileSpec, + test_data: Tuple[tuple[torch.Tensor, ...], int], + ): + ( + ArmTester( + module, + example_inputs=test_data, + compile_spec=compile_spec, + ) + .quantize() + .export() + .check_count({"torch.ops.aten.cat.default": 1}) + .check(["torch.ops.quantized_decomposed"]) + .to_edge() + .partition() + .check_not(["executorch_exir_dialects_edge__ops_aten_cat_default"]) + .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) + .to_executorch() + ) + + @parameterized.expand(Cat.test_parameters) + def test_cat_tosa_MI(self, operands: tuple[torch.Tensor, ...], dim: int): + test_data = (operands, dim) + self._test_cat_tosa_MI_pipeline(self.Cat(), test_data) + + def test_cat_4d_tosa_MI(self): + square = torch.ones((2, 2, 2, 2)) + for dim in range(-3, 3): + test_data = ((square, square), dim) + self._test_cat_tosa_MI_pipeline(self.Cat(), test_data) + + @parameterized.expand(Cat.test_parameters) + def test_cat_tosa_BI(self, operands: tuple[torch.Tensor, ...], dim: int): + test_data = (operands, dim) + self._test_cat_tosa_BI_pipeline(self.Cat(), test_data) + + @parameterized.expand(Cat.test_parameters) + def test_cat_u55_BI(self, operands: tuple[torch.Tensor, ...], dim: int): + test_data = (operands, dim) + self._test_cat_ethosu_BI_pipeline( + self.Cat(), common.get_u55_compile_spec(), test_data + 
) + + @parameterized.expand(Cat.test_parameters) + def test_cat_u85_BI(self, operands: tuple[torch.Tensor, ...], dim: int): + test_data = (operands, dim) + self._test_cat_ethosu_BI_pipeline( + self.Cat(), common.get_u85_compile_spec(), test_data + ) diff --git a/backends/arm/test/ops/test_clone.py b/backends/arm/test/ops/test_clone.py index 8386283f24e..9852c5c4520 100644 --- a/backends/arm/test/ops/test_clone.py +++ b/backends/arm/test/ops/test_clone.py @@ -21,6 +21,8 @@ from executorch.backends.arm.test.tester.arm_tester import ArmTester from executorch.backends.xnnpack.test.tester.tester import Quantize + +from executorch.exir.backend.compile_spec_schema import CompileSpec from parameterized import parameterized @@ -76,16 +78,15 @@ def _test_clone_tosa_BI_pipeline( .run_method_and_compare_outputs(inputs=test_data, qtol=1) ) - def _test_clone_tosa_u55_pipeline( - self, module: torch.nn.Module, test_data: Tuple[torch.Tensor] + def _test_clone_tosa_ethos_pipeline( + self, + compile_spec: list[CompileSpec], + module: torch.nn.Module, + test_data: Tuple[torch.Tensor], ): quantizer = ArmQuantizer().set_io(get_symmetric_quantization_config()) ( - ArmTester( - module, - example_inputs=test_data, - compile_spec=common.get_u55_compile_spec(), - ) + ArmTester(module, example_inputs=test_data, compile_spec=compile_spec) .quantize(Quantize(quantizer, get_symmetric_quantization_config())) .export() .check_count({"torch.ops.aten.clone.default": 1}) @@ -95,6 +96,20 @@ def _test_clone_tosa_u55_pipeline( .to_executorch() ) + def _test_clone_tosa_u55_pipeline( + self, module: torch.nn.Module, test_data: Tuple[torch.Tensor] + ): + self._test_clone_tosa_ethos_pipeline( + common.get_u55_compile_spec(), module, test_data + ) + + def _test_clone_tosa_u85_pipeline( + self, module: torch.nn.Module, test_data: Tuple[torch.Tensor] + ): + self._test_clone_tosa_ethos_pipeline( + common.get_u85_compile_spec(), module, test_data + ) + @parameterized.expand(Clone.test_parameters) def test_clone_tosa_MI(self, test_tensor: torch.Tensor): self._test_clone_tosa_MI_pipeline(self.Clone(), (test_tensor,)) @@ -106,3 +121,7 @@ def test_clone_tosa_BI(self, test_tensor: torch.Tensor): @parameterized.expand(Clone.test_parameters) def test_clone_u55_BI(self, test_tensor: torch.Tensor): self._test_clone_tosa_u55_pipeline(self.Clone(), (test_tensor,)) + + @parameterized.expand(Clone.test_parameters) + def test_clone_u85_BI(self, test_tensor: torch.Tensor): + self._test_clone_tosa_u85_pipeline(self.Clone(), (test_tensor,)) diff --git a/backends/arm/test/ops/test_conv.py b/backends/arm/test/ops/test_conv.py index 9ebfe77da2c..286404922f2 100644 --- a/backends/arm/test/ops/test_conv.py +++ b/backends/arm/test/ops/test_conv.py @@ -4,7 +4,6 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
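The clone tests above rest on the fact that aten.clone is numerically the identity; it only forces a data copy, which is why RemoveClonePass can drop it before delegation without changing outputs. Trivially:

import torch

x = torch.rand(2, 3, 4)
print(torch.equal(x.clone(), x))  # True: removing the clone cannot change results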
-import logging import unittest from typing import List, Tuple, Union @@ -13,11 +12,9 @@ from executorch.backends.arm.test import common from executorch.backends.arm.test.tester.arm_tester import ArmTester +from executorch.exir.backend.compile_spec_schema import CompileSpec from parameterized import parameterized -logger = logging.getLogger(__name__) -logger.setLevel(logging.INFO) - class Conv2d(torch.nn.Module): """ @@ -159,14 +156,14 @@ def forward(self, x): batches=1, ) -conv2d_2x2_1x1x14x14_st2 = Conv2d( +conv2d_2x2_1x1x14x13_st2 = Conv2d( in_channels=1, out_channels=1, kernel_size=(2, 2), stride=2, padding=0, width=14, - height=14, + height=13, batches=1, ) @@ -192,6 +189,18 @@ def forward(self, x): batches=1, ) +conv2d_5x5_1x3x14x15_st3_pd1 = Conv2d( + in_channels=3, + out_channels=16, + kernel_size=(5, 5), + stride=3, + padding=1, + width=14, + height=15, + batches=1, +) + + two_conv2d_nobias = Conv2d( nbr_conv=2, width=256, @@ -225,7 +234,8 @@ def forward(self, x): ("3x3_1x3x256x256_st1", conv2d_3x3_1x3x256x256_st1), ("3x3_1x3x12x12_st2_pd1", conv2d_3x3_1x3x12x12_st2_pd1), ("1x1_1x2x128x128_st1", conv2d_1x1_1x2x128x128_st1), - ("2x2_1x1x14x14_st2", conv2d_2x2_1x1x14x14_st2), + ("2x2_1x1x14x13_st2_needs_adjust_pass", conv2d_2x2_1x1x14x13_st2), + ("conv2d_5x5_1x3x14x15_st3_pd1_needs_adjust_pass", conv2d_5x5_1x3x14x15_st3_pd1), ("5x5_3x2x128x128_st1", conv2d_5x5_3x2x128x128_st1), ("3x3_1x3x224x224_st2_pd1", conv2d_3x3_1x3x224x224_st2_pd1), ("two_conv2d_nobias", two_conv2d_nobias), @@ -240,7 +250,10 @@ def forward(self, x): testsuite_u55.remove(("5x5_3x2x128x128_st1", conv2d_5x5_3x2x128x128_st1)) # Fails when enabling CompileSpec.set_quantize_io(True). MLETORCH-191. -testsuite_u55.remove(("2x2_1x1x14x14_st2", conv2d_2x2_1x1x14x14_st2)) +testsuite_u55.remove(("2x2_1x1x14x13_st2_needs_adjust_pass", conv2d_2x2_1x1x14x13_st2)) +testsuite_u55.remove( + ("conv2d_5x5_1x3x14x15_st3_pd1_needs_adjust_pass", conv2d_5x5_1x3x14x15_st3_pd1) +) class TestConv2D(unittest.TestCase): @@ -285,14 +298,17 @@ def _test_conv2d_tosa_BI_pipeline( .run_method_and_compare_outputs(inputs=test_data, qtol=1) ) - def _test_conv2d_u55_BI_pipeline( - self, module: torch.nn.Module, test_data: Tuple[torch.Tensor] + def _test_conv2d_ethosu_BI_pipeline( + self, + compile_spec: CompileSpec, + module: torch.nn.Module, + test_data: Tuple[torch.Tensor], ): ( ArmTester( module, example_inputs=test_data, - compile_spec=common.get_u55_compile_spec(permute_memory_to_nhwc=True), + compile_spec=compile_spec, ) .quantize() .export() @@ -313,4 +329,16 @@ def test_conv2d_tosa_BI(self, test_name, model): @parameterized.expand(testsuite_u55) def test_conv2d_u55_BI(self, test_name, model): - self._test_conv2d_u55_BI_pipeline(model, model.get_inputs()) + self._test_conv2d_ethosu_BI_pipeline( + common.get_u55_compile_spec(permute_memory_to_nhwc=True), + model, + model.get_inputs(), + ) + + @parameterized.expand(testsuite_u55) + def test_conv2d_u85_BI(self, test_name, model): + self._test_conv2d_ethosu_BI_pipeline( + common.get_u85_compile_spec(permute_memory_to_nhwc=True), + model, + model.get_inputs(), + ) diff --git a/backends/arm/test/ops/test_conv_combos.py b/backends/arm/test/ops/test_conv_combos.py index 88006df1a01..1fe4f1b5a57 100644 --- a/backends/arm/test/ops/test_conv_combos.py +++ b/backends/arm/test/ops/test_conv_combos.py @@ -12,6 +12,7 @@ import torch from executorch.backends.arm.test import common from executorch.backends.arm.test.tester.arm_tester import ArmTester +from executorch.exir.backend.backend_details import 
CompileSpec from parameterized import parameterized logger = logging.getLogger(__name__) @@ -102,7 +103,7 @@ def forward(self, x): return self.adaptive_avg_pool2d(x) -class ComboConvBatchnormRelu(torch.nn.Module): +class ComboConvBatchnormRelu6(torch.nn.Module): edge_op_list = [ "executorch_exir_dialects_edge__ops_aten_convolution_default", "executorch_exir_dialects_edge__ops_aten__native_batch_norm_legit_no_training_default", @@ -199,14 +200,17 @@ def _test_conv_combo_tosa_BI_pipeline( ) ) - def _test_conv_combo_u55_BI_pipeline( - self, module: torch.nn.Module, test_data: Tuple[torch.Tensor] + def _test_conv_combo_ethos_BI_pipeline( + self, + module: torch.nn.Module, + compile_spec: CompileSpec, + test_data: Tuple[torch.Tensor], ): ( ArmTester( module, example_inputs=test_data, - compile_spec=common.get_u55_compile_spec(permute_memory_to_nhwc=True), + compile_spec=compile_spec, ) .quantize() .export() @@ -230,22 +234,44 @@ def test_conv_meandim_tosa_BI(self): def test_conv_meandim_u55_BI(self): model = ComboConv2dMeandim() - self._test_conv_combo_u55_BI_pipeline(model, model.get_inputs()) + self._test_conv_combo_ethos_BI_pipeline( + model, + common.get_u55_compile_spec(permute_memory_to_nhwc=True), + model.get_inputs(), + ) + + def test_conv_meandim_u85_BI(self): + model = ComboConv2dMeandim() + self._test_conv_combo_ethos_BI_pipeline( + model, + common.get_u85_compile_spec(permute_memory_to_nhwc=True), + model.get_inputs(), + ) ############################## ## Conv + batch norm + relu ## ############################## - def test_conv_batchnorm_relu_tosa_MI(self): - model = ComboConvBatchnormRelu() + def test_conv_batchnorm_relu6_tosa_MI(self): + model = ComboConvBatchnormRelu6() self._test_conv_combo_tosa_MI_pipeline(model, model.get_inputs()) - def test_conv_batchnorm_relu_tosa_BI(self): - model = ComboConvBatchnormRelu() + def test_conv_batchnorm_relu6_tosa_BI(self): + model = ComboConvBatchnormRelu6() self._test_conv_combo_tosa_BI_pipeline(model, model.get_inputs()) - def test_conv_batchnorm_relu_u55_BI(self): - model = ComboConvBatchnormRelu() - self._test_conv_combo_u55_BI_pipeline(model, model.get_inputs()) + def test_conv_batchnorm_relu6_u55_BI(self): + model = ComboConvBatchnormRelu6() + self._test_conv_combo_ethos_BI_pipeline( + model, common.get_u55_compile_spec(), model.get_inputs() + ) + + def test_conv_batchnorm_relu_u85_BI(self): + model = ComboConvBatchnormRelu6() + self._test_conv_combo_ethos_BI_pipeline( + model, + common.get_u85_compile_spec(), + model.get_inputs(), + ) ################## ## Conv + ReLU6 ## @@ -266,7 +292,17 @@ def test_conv_relu6_tosa_BI(self, test_data: torch.Tensor): def test_conv_relu6_u55_BI(self, test_data: torch.Tensor): model = ComboConvRelu6() test_data = (test_data,) - self._test_conv_combo_u55_BI_pipeline(model, test_data) + self._test_conv_combo_ethos_BI_pipeline( + model, common.get_u55_compile_spec(permute_memory_to_nhwc=True), test_data + ) + + @parameterized.expand(ComboConvRelu6.test_data) + def test_conv_relu6_u85_BI(self, test_data: torch.Tensor): + model = ComboConvRelu6() + test_data = (test_data,) + self._test_conv_combo_ethos_BI_pipeline( + model, common.get_u85_compile_spec(permute_memory_to_nhwc=True), test_data + ) ############################### ## Block bottleneck residual ## @@ -281,4 +317,16 @@ def test_block_bottleneck_residual_tosa_BI(self): def test_block_bottleneck_residual_u55_BI(self): model = ComboBlockBottleneckResidual() - self._test_conv_combo_u55_BI_pipeline(model, model.get_inputs()) + 
self._test_conv_combo_ethos_BI_pipeline( + model, + common.get_u55_compile_spec(permute_memory_to_nhwc=True), + model.get_inputs(), + ) + + def test_block_bottleneck_residual_u85_BI(self): + model = ComboBlockBottleneckResidual() + self._test_conv_combo_ethos_BI_pipeline( + model, + common.get_u85_compile_spec(permute_memory_to_nhwc=True), + model.get_inputs(), + ) diff --git a/backends/arm/test/ops/test_depthwise_conv.py b/backends/arm/test/ops/test_depthwise_conv.py index 9b3f79e6a11..11b9e4876bb 100644 --- a/backends/arm/test/ops/test_depthwise_conv.py +++ b/backends/arm/test/ops/test_depthwise_conv.py @@ -16,6 +16,7 @@ from executorch.backends.arm.test.ops.test_conv import Conv2d from executorch.backends.arm.test.tester.arm_tester import ArmTester +from executorch.exir.backend.backend_details import CompileSpec from parameterized import parameterized logger = logging.getLogger(__name__) @@ -172,14 +173,17 @@ def _test_dw_conv2d_tosa_BI_pipeline( .run_method_and_compare_outputs(inputs=test_data, qtol=1) ) - def _test_dw_conv2d_u55_BI_pipeline( - self, module: torch.nn.Module, test_data: Tuple[torch.Tensor] + def _test_dw_conv2d_ethos_BI_pipeline( + self, + module: torch.nn.Module, + compile_spec: CompileSpec, + test_data: Tuple[torch.Tensor], ): ( ArmTester( module, example_inputs=test_data, - compile_spec=common.get_u55_compile_spec(permute_memory_to_nhwc=True), + compile_spec=compile_spec, ) .quantize() .export() @@ -191,16 +195,35 @@ def _test_dw_conv2d_u55_BI_pipeline( ) @parameterized.expand(testsuite) - def test_dw_conv2d_tosa_MI(self, test_name, model): + def test_dw_conv2d_tosa_MI(self, test_name: str, model: torch.nn.Module): self._test_dw_conv2d_tosa_MI_pipeline(model, model.get_inputs()) # TODO: Investigate flakyness (MLTORCH-307) @parameterized.expand(testsuite) @pytest.mark.flaky(reruns=3) - def test_dw_conv2d_tosa_BI(self, test_name, model): + def test_dw_conv2d_tosa_BI(self, test_name: str, model: torch.nn.Module): self._test_dw_conv2d_tosa_BI_pipeline(model, model.get_inputs()) @parameterized.expand(testsuite_u55, skip_on_empty=True) - @unittest.expectedFailure - def test_dw_conv2d_u55_BI(self, test_name, model): - self._test_dw_conv2d_u55_BI_pipeline(model, model.get_inputs()) + def test_dw_conv2d_u55_BI( + self, test_name: str, model: torch.nn.Module, set_quantize_io: bool = False + ): + self._test_dw_conv2d_ethos_BI_pipeline( + model, + common.get_u55_compile_spec( + permute_memory_to_nhwc=True, quantize_io=set_quantize_io + ), + model.get_inputs(), + ) + + @parameterized.expand(testsuite) + def test_dw_conv2d_u85_BI( + self, test_name: str, model: torch.nn.Module, set_quantize_io: bool = False + ): + self._test_dw_conv2d_ethos_BI_pipeline( + model, + common.get_u85_compile_spec( + permute_memory_to_nhwc=True, quantize_io=set_quantize_io + ), + model.get_inputs(), + ) diff --git a/backends/arm/test/ops/test_exp.py b/backends/arm/test/ops/test_exp.py new file mode 100644 index 00000000000..6e85d8fe49b --- /dev/null +++ b/backends/arm/test/ops/test_exp.py @@ -0,0 +1,120 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# Copyright 2024 Arm Limited and/or its affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
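Tying back to SizeAdjustConv2DPass introduced earlier in this diff: conv_remainder measures how many trailing rows or columns of the input can never be covered by a full kernel window, and the pass slices them off whenever that remainder exceeds the padding. A worked example for the 2x2-kernel, stride-2, 14x13 test case flagged as "needs_adjust_pass" above.

def conv_remainder(input_length, pad, dilation, kernel, stride):
    return (input_length + 2 * pad - dilation * (kernel - 1) - 1) % stride

print(conv_remainder(13, 0, 1, 2, 2))  # 1 > pad 0, so the pass slices height 13 down to 12
print(conv_remainder(14, 0, 1, 2, 2))  # 0, a 14-pixel dimension needs no adjustment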
+
+import unittest
+
+from typing import Tuple
+
+import torch
+from executorch.backends.arm.test import common
+from executorch.backends.arm.test.tester.arm_tester import ArmTester
+from executorch.exir.backend.backend_details import CompileSpec
+from parameterized import parameterized
+
+test_data_suite = [
+    # (test_name, test_data)
+    ("zeros", torch.zeros(1, 10, 10, 10)),
+    ("ones", torch.ones(10, 10, 10)),
+    ("rand", torch.rand(10, 10) - 0.5),
+    ("randn_pos", torch.randn(10) + 10),
+    ("randn_neg", torch.randn(10) - 10),
+    ("ramp", torch.arange(-16, 16, 0.2)),
+]
+
+
+class TestExp(unittest.TestCase):
+    """Tests lowering of aten.exp"""
+
+    class Exp(torch.nn.Module):
+        def forward(self, x: torch.Tensor) -> torch.Tensor:
+            return torch.exp(x)
+
+    def _test_exp_tosa_MI_pipeline(
+        self, module: torch.nn.Module, test_data: Tuple[torch.tensor]
+    ):
+        (
+            ArmTester(
+                module,
+                example_inputs=test_data,
+                compile_spec=common.get_tosa_compile_spec(),
+            )
+            .export()
+            .check(["torch.ops.aten.exp.default"])
+            .check_not(["torch.ops.quantized_decomposed"])
+            .to_edge()
+            .partition()
+            .check_not(["executorch_exir_dialects_edge__ops_aten_exp_default"])
+            .check_count({"torch.ops.higher_order.executorch_call_delegate": 1})
+            .to_executorch()
+            .run_method_and_compare_outputs(inputs=test_data)
+        )
+
+    def _test_exp_tosa_BI_pipeline(self, module: torch.nn.Module, test_data: Tuple):
+        (
+            ArmTester(
+                module,
+                example_inputs=test_data,
+                compile_spec=common.get_tosa_compile_spec(),
+            )
+            .quantize()
+            .export()
+            .check(["torch.ops.aten.exp.default"])
+            .check(["torch.ops.quantized_decomposed"])
+            .to_edge()
+            .partition()
+            .check_not(["executorch_exir_dialects_edge__ops_aten_exp_default"])
+            .check_count({"torch.ops.higher_order.executorch_call_delegate": 1})
+            .to_executorch()
+            .run_method_and_compare_outputs(inputs=test_data)
+        )
+
+    def _test_exp_ethosu_BI_pipeline(
+        self,
+        compile_spec: CompileSpec,
+        module: torch.nn.Module,
+        test_data: Tuple[torch.tensor],
+    ):
+        (
+            ArmTester(
+                module,
+                example_inputs=test_data,
+                compile_spec=compile_spec,
+            )
+            .quantize()
+            .export()
+            .check_count({"torch.ops.aten.exp.default": 1})
+            .check(["torch.ops.quantized_decomposed"])
+            .to_edge()
+            .partition()
+            .check_not(["executorch_exir_dialects_edge__ops_aten_exp_default"])
+            .check_count({"torch.ops.higher_order.executorch_call_delegate": 1})
+            .to_executorch()
+        )
+
+    @parameterized.expand(test_data_suite)
+    def test_exp_tosa_MI(
+        self,
+        test_name: str,
+        test_data: torch.Tensor,
+    ):
+        self._test_exp_tosa_MI_pipeline(self.Exp(), (test_data,))
+
+    @parameterized.expand(test_data_suite)
+    def test_exp_tosa_BI(self, test_name: str, test_data: torch.Tensor):
+        self._test_exp_tosa_BI_pipeline(self.Exp(), (test_data,))
+
+    @parameterized.expand(test_data_suite)
+    def test_exp_tosa_u55_BI(self, test_name: str, test_data: torch.Tensor):
+        self._test_exp_ethosu_BI_pipeline(
+            common.get_u55_compile_spec(), self.Exp(), (test_data,)
+        )
+
+    @parameterized.expand(test_data_suite)
+    def test_exp_tosa_u85_BI(self, test_name: str, test_data: torch.Tensor):
+        self._test_exp_ethosu_BI_pipeline(
+            common.get_u85_compile_spec(), self.Exp(), (test_data,)
+        )
diff --git a/backends/arm/test/ops/test_expand.py b/backends/arm/test/ops/test_expand.py
index 66c081a544c..e9bbea9a5e5 100644
--- a/backends/arm/test/ops/test_expand.py
+++ b/backends/arm/test/ops/test_expand.py
@@ -76,7 +76,9 @@ def _test_expand_tosa_BI_pipeline(self, module: torch.nn.Module, test_data: Tupl
.run_method_and_compare_outputs(inputs=test_data, qtol=1) ) - def _test_expand_tosa_u55_pipeline(self, module: torch.nn.Module, test_data: Tuple): + def _test_expand_ethosu_BI_pipeline( + self, module: torch.nn.Module, test_data: Tuple + ): quantizer = ArmQuantizer().set_io(get_symmetric_quantization_config()) ( ArmTester( @@ -104,6 +106,15 @@ def test_expand_tosa_BI(self, test_input, multiples): # Expected failure since tosa.TILE is unsupported by Vela. @parameterized.expand(Expand.test_parameters) - @unittest.expectedFailure + @unittest.expectedFailure # TODO: MLBEDSW-9386 def test_expand_u55_BI(self, test_input, multiples): - self._test_expand_tosa_u55_pipeline(self.Expand(), (test_input, multiples)) + self._test_expand_ethosu_BI_pipeline( + self.Expand(), common.get_u55_compile_spec(), (test_input, multiples) + ) + + @parameterized.expand(Expand.test_parameters) + @unittest.expectedFailure # TODO: MLBEDSW-9386 + def test_expand_u85_BI(self, test_input, multiples): + self._test_expand_ethosu_BI_pipeline( + self.Expand(), common.get_u85_compile_spec(), (test_input, multiples) + ) diff --git a/backends/arm/test/ops/test_full.py b/backends/arm/test/ops/test_full.py index 1be7f59ab8f..2722edef328 100644 --- a/backends/arm/test/ops/test_full.py +++ b/backends/arm/test/ops/test_full.py @@ -15,6 +15,7 @@ import torch from executorch.backends.arm.test import common from executorch.backends.arm.test.tester.arm_tester import ArmTester +from executorch.exir.backend.compile_spec_schema import CompileSpec from parameterized import parameterized @@ -93,13 +94,11 @@ def _test_full_tosa_BI_pipeline( .run_method_and_compare_outputs(inputs=test_data) ) - def _test_full_tosa_u55_pipeline(self, module: torch.nn.Module, test_data: Tuple): + def _test_full_tosa_ethos_pipeline( + self, compile_spec: list[CompileSpec], module: torch.nn.Module, test_data: Tuple + ): ( - ArmTester( - module, - example_inputs=test_data, - compile_spec=common.get_u55_compile_spec(), - ) + ArmTester(module, example_inputs=test_data, compile_spec=compile_spec) .quantize() .export() .check_count({"torch.ops.aten.full.default": 1}) @@ -110,6 +109,16 @@ def _test_full_tosa_u55_pipeline(self, module: torch.nn.Module, test_data: Tuple .to_executorch() ) + def _test_full_tosa_u55_pipeline(self, module: torch.nn.Module, test_data: Tuple): + self._test_full_tosa_ethos_pipeline( + common.get_u55_compile_spec(), module, test_data + ) + + def _test_full_tosa_u85_pipeline(self, module: torch.nn.Module, test_data: Tuple): + self._test_full_tosa_ethos_pipeline( + common.get_u85_compile_spec(), module, test_data + ) + def test_only_full_tosa_MI(self): self._test_full_tosa_MI_pipeline(self.Full(), ()) @@ -138,6 +147,13 @@ def test_full_u55_BI(self, test_tensor: Tuple): test_tensor, ) + @parameterized.expand(AddVariableFull.test_parameters) + def test_full_u85_BI(self, test_tensor: Tuple): + self._test_full_tosa_u85_pipeline( + self.AddVariableFull(), + test_tensor, + ) + # This fails since full outputs int64 by default if 'fill_value' is integer, which our backend doesn't support. 
@unittest.expectedFailure def test_integer_value(self): diff --git a/backends/arm/test/ops/test_linear.py b/backends/arm/test/ops/test_linear.py index 33f62955ecd..3f68ab0251a 100644 --- a/backends/arm/test/ops/test_linear.py +++ b/backends/arm/test/ops/test_linear.py @@ -15,6 +15,7 @@ from executorch.backends.arm.test.tester.arm_tester import ArmTester from executorch.exir import EdgeCompileConfig +from executorch.exir.backend.compile_spec_schema import CompileSpec from parameterized import parameterized logger = logging.getLogger(__name__) @@ -26,17 +27,17 @@ ( "model_linear_rank1_zeros", torch.zeros(10), - 10, + 15, ), ( "model_linear_rank1_ones", torch.ones(10), - 10, + 15, ), ( "model_linear_rank1_negative_ones", torch.ones(10) * (-1), - 10, + 20, ), ( "model_linear_rank1_rand", @@ -46,12 +47,12 @@ ( "model_linear_rank1_negative_large_rand", torch.rand(10) * (-100), - 10, + 30, ), ( "model_linear_rank1_large_randn", - torch.randn(10) * 100, - 10, + torch.randn(15) * 100, + 20, ), ] @@ -153,14 +154,17 @@ def _test_linear_tosa_BI_pipeline( .run_method_and_compare_outputs(inputs=test_data, qtol=True) ) - def _test_linear_tosa_u55_BI_pipeline( - self, module: torch.nn.Module, test_data: Tuple[torch.Tensor] - ): + def _test_linear_tosa_ethosu_BI_pipeline( + self, + module: torch.nn.Module, + compile_spec: CompileSpec, + test_data: Tuple[torch.Tensor], + ) -> ArmTester: tester = ( ArmTester( module, example_inputs=test_data, - compile_spec=common.get_u55_compile_spec(permute_memory_to_nhwc=False), + compile_spec=compile_spec, ) .quantize() .export() @@ -172,9 +176,7 @@ def _test_linear_tosa_u55_BI_pipeline( .to_executorch() .serialize() ) - - if common.is_option_enabled("corstone300"): - tester.run_method_and_compare_outputs(qtol=1, inputs=test_data) + return tester @parameterized.expand(test_data_suite_rank1 + test_data_suite_rank4) def test_linear_tosa_MI( @@ -215,10 +217,32 @@ def test_linear_tosa_u55_BI( ): in_features = test_data.shape[-1] test_data = (test_data,) - self._test_linear_tosa_u55_BI_pipeline( + tester = self._test_linear_tosa_ethosu_BI_pipeline( + self.Linear( + in_features=in_features, + out_features=out_features, + ), + common.get_u55_compile_spec(permute_memory_to_nhwc=False), + test_data, + ) + + if common.is_option_enabled("corstone300"): + tester.run_method_and_compare_outputs(qtol=1, inputs=test_data) + + @parameterized.expand(test_data_suite_rank1) + def test_linear_tosa_u85_BI( + self, + test_name: str, + test_data: torch.Tensor, + out_features: int, + ): + in_features = test_data.shape[-1] + test_data = (test_data,) + self._test_linear_tosa_ethosu_BI_pipeline( self.Linear( in_features=in_features, out_features=out_features, ), + common.get_u85_compile_spec(permute_memory_to_nhwc=False), test_data, ) diff --git a/backends/arm/test/ops/test_log.py b/backends/arm/test/ops/test_log.py new file mode 100644 index 00000000000..269b7be25f5 --- /dev/null +++ b/backends/arm/test/ops/test_log.py @@ -0,0 +1,120 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# Copyright 2024 Arm Limited and/or its affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +import unittest + +from typing import Tuple + +import torch +from executorch.backends.arm.test import common +from executorch.backends.arm.test.tester.arm_tester import ArmTester +from executorch.exir.backend.backend_details import CompileSpec +from parameterized import parameterized + +test_data_suite = [ + # (test_name, test_data) + ("ones_rank4", torch.ones(1, 10, 10, 10)), + ("ones_rank3", torch.ones(10, 10, 10)), + ("rand", torch.rand(10, 10) + 0.001), + ("randn_pos", torch.randn(10) + 10), + ("randn_spread", torch.max(torch.Tensor([0.0]), torch.randn(10) * 100)), + ("ramp", torch.arange(0.01, 20, 0.2)), +] + + +class TestLog(unittest.TestCase): + """Tests lowering of aten.log""" + + class Log(torch.nn.Module): + def forward(self, x: torch.Tensor) -> torch.Tensor: + return torch.log(x) + + def _test_log_tosa_MI_pipeline( + self, module: torch.nn.Module, test_data: Tuple[torch.tensor] + ): + ( + ArmTester( + module, + example_inputs=test_data, + compile_spec=common.get_tosa_compile_spec(), + ) + .export() + .check(["torch.ops.aten.log.default"]) + .check_not(["torch.ops.quantized_decomposed"]) + .to_edge() + .partition() + .check_not(["executorch_exir_dialects_edge__ops_aten_log_default"]) + .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) + .to_executorch() + .run_method_and_compare_outputs(inputs=test_data) + ) + + def _test_log_tosa_BI_pipeline(self, module: torch.nn.Module, test_data: Tuple): + ( + ArmTester( + module, + example_inputs=test_data, + compile_spec=common.get_tosa_compile_spec(), + ) + .quantize() + .export() + .check(["torch.ops.aten.log.default"]) + .check(["torch.ops.quantized_decomposed"]) + .to_edge() + .partition() + .check_not(["executorch_exir_dialects_edge__ops_aten_log_default"]) + .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) + .to_executorch() + .run_method_and_compare_outputs(inputs=test_data) + ) + + def _test_log_ethosu_BI_pipeline( + self, + compile_spec: CompileSpec, + module: torch.nn.Module, + test_data: Tuple[torch.tensor], + ): + ( + ArmTester( + module, + example_inputs=test_data, + compile_spec=compile_spec, + ) + .quantize() + .export() + .check_count({"torch.ops.aten.log.default": 1}) + .check(["torch.ops.quantized_decomposed"]) + .to_edge() + .partition() + .check_not(["executorch_exir_dialects_edge__ops_aten_log_default"]) + .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) + .to_executorch() + ) + + @parameterized.expand(test_data_suite) + def test_log_tosa_MI( + self, + test_name: str, + test_data: torch.Tensor, + ): + self._test_log_tosa_MI_pipeline(self.Log(), (test_data,)) + + @parameterized.expand(test_data_suite) + def test_log_tosa_BI(self, test_name: str, test_data: torch.Tensor): + self._test_log_tosa_BI_pipeline(self.Log(), (test_data,)) + + @parameterized.expand(test_data_suite) + def test_log_tosa_u55_BI(self, test_name: str, test_data: torch.Tensor): + self._test_log_ethosu_BI_pipeline( + common.get_u55_compile_spec(), self.Log(), (test_data,) + ) + + @parameterized.expand(test_data_suite) + def test_log_tosa_u85_BI(self, test_name: str, test_data: torch.Tensor): + self._test_log_ethosu_BI_pipeline( + common.get_u85_compile_spec(), self.Log(), (test_data,) + ) diff --git a/backends/arm/test/ops/test_mean_dim.py b/backends/arm/test/ops/test_mean_dim.py index e0db958f743..0653e84e704 100644 --- a/backends/arm/test/ops/test_mean_dim.py +++ b/backends/arm/test/ops/test_mean_dim.py @@ -13,6 +13,7 @@ import torch from executorch.backends.arm.test import common from 
executorch.backends.arm.test.tester.arm_tester import ArmTester +from executorch.exir.backend.backend_details import CompileSpec from parameterized import parameterized logger = logging.getLogger(__name__) @@ -91,14 +92,17 @@ def _test_meandim_tosa_BI_pipeline( .run_method_and_compare_outputs(inputs=test_data, qtol=1) ) - def _test_meandim_tosa_u55_BI_pipeline( - self, module: torch.nn.Module, test_data: Tuple[torch.tensor] + def _test_meandim_tosa_ethosu_BI_pipeline( + self, + module: torch.nn.Module, + compile_spec: CompileSpec, + test_data: Tuple[torch.tensor], ): ( ArmTester( module, example_inputs=test_data, - compile_spec=common.get_u55_compile_spec(), + compile_spec=compile_spec, ) .quantize() .export() @@ -106,7 +110,12 @@ def _test_meandim_tosa_u55_BI_pipeline( .check(["torch.ops.quantized_decomposed"]) .to_edge() .partition() - .check_not(["executorch_exir_dialects_edge__ops_aten_mean_dim"]) + .check_not( + [ + "executorch_exir_dialects_edge__ops_aten_mean_dim", + "executorch_exir_dialects_edge__ops_aten_avg_pool2d_default", + ] + ) .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) .to_executorch() ) @@ -133,4 +142,20 @@ def test_meandim_tosa_u55_BI( test_name: str, test_data: torch.Tensor, ): - self._test_meandim_tosa_u55_BI_pipeline(self.MeanDim(), (test_data,)) + self._test_meandim_tosa_ethosu_BI_pipeline( + self.MeanDim(), + common.get_u55_compile_spec(), + (test_data,), + ) + + @parameterized.expand(test_data_suite) + def test_meandim_tosa_u85_BI( + self, + test_name: str, + test_data: torch.Tensor, + ): + self._test_meandim_tosa_ethosu_BI_pipeline( + self.MeanDim(), + common.get_u85_compile_spec(), + (test_data,), + ) diff --git a/backends/arm/test/ops/test_mm.py b/backends/arm/test/ops/test_mm.py index 9a9b3ef579b..4271496eaa9 100644 --- a/backends/arm/test/ops/test_mm.py +++ b/backends/arm/test/ops/test_mm.py @@ -12,6 +12,7 @@ import torch from executorch.backends.arm.test import common from executorch.backends.arm.test.tester.arm_tester import ArmTester +from executorch.exir.backend.backend_details import CompileSpec from parameterized import parameterized logger = logging.getLogger(__name__) @@ -87,14 +88,17 @@ def _test_mm_tosa_BI_pipeline( .run_method_and_compare_outputs(inputs=test_data) ) - def _test_mm_u55_BI_pipeline( - self, module: torch.nn.Module, test_data: Tuple[torch.Tensor] + def _test_mm_ethosu_BI_pipeline( + self, + compile_spec: CompileSpec, + module: torch.nn.Module, + test_data: Tuple[torch.Tensor], ): ( ArmTester( module, example_inputs=test_data, - compile_spec=common.get_u55_compile_spec(), + compile_spec=compile_spec, ) .quantize() .export() @@ -131,11 +135,29 @@ def test_mm_single_input_tosa_BI(self, operand1: torch.Tensor): @unittest.expectedFailure def test_mm_u55_BI(self, operand1: torch.Tensor, operand2: torch.Tensor): test_data = (operand1, operand2) - self._test_mm_u55_BI_pipeline(self.MM(), test_data) + self._test_mm_ethosu_BI_pipeline( + common.get_u55_compile_spec(), self.MM(), test_data + ) # Expected to fail with error: Warning, unsupported fusing of TOSA Rescale previous operator is of type: Memcpy @parameterized.expand(MMSingleInput.test_parameters) @unittest.expectedFailure def test_mm_single_input_u55_BI(self, operand1: torch.Tensor): test_data = (operand1,) - self._test_mm_u55_BI_pipeline(self.MMSingleInput(), test_data) + self._test_mm_ethosu_BI_pipeline( + common.get_u55_compile_spec(), self.MMSingleInput(), test_data + ) + + @parameterized.expand(MM.test_parameters) + def test_mm_u85_BI(self, operand1: 
torch.Tensor, operand2: torch.Tensor): + test_data = (operand1, operand2) + self._test_mm_ethosu_BI_pipeline( + common.get_u85_compile_spec(), self.MM(), test_data + ) + + @parameterized.expand(MMSingleInput.test_parameters) + def test_mm_single_input_u85_BI(self, operand1: torch.Tensor): + test_data = (operand1,) + self._test_mm_ethosu_BI_pipeline( + common.get_u85_compile_spec(), self.MMSingleInput(), test_data + ) diff --git a/backends/arm/test/ops/test_mul.py b/backends/arm/test/ops/test_mul.py index dee8b62f1b2..a1c2dba5fed 100644 --- a/backends/arm/test/ops/test_mul.py +++ b/backends/arm/test/ops/test_mul.py @@ -10,6 +10,7 @@ import torch from executorch.backends.arm.test import common from executorch.backends.arm.test.tester.arm_tester import ArmTester +from executorch.exir.backend.backend_details import CompileSpec from parameterized import parameterized test_data_sute = [ @@ -101,14 +102,17 @@ def _test_mul_tosa_BI_pipeline( .run_method_and_compare_outputs(inputs=test_data, qtol=1.0) ) - def _test_mul_u55_BI_pipeline( - self, module: torch.nn.Module, test_data: tuple[torch.Tensor, torch.Tensor] + def _test_mul_ethosu_BI_pipeline( + self, + compile_spec: CompileSpec, + module: torch.nn.Module, + test_data: tuple[torch.Tensor, torch.Tensor], ): ( ArmTester( module, example_inputs=test_data, - compile_spec=common.get_u55_compile_spec(permute_memory_to_nhwc=True), + compile_spec=compile_spec, ) .quantize() .export() @@ -141,9 +145,7 @@ def test_mul_tosa_BI( test_data = (input_, other_) self._test_mul_tosa_BI_pipeline(self.Mul(), test_data) - # Expected to fail since RESCALE cannot be fused with MUL in Vela. @parameterized.expand(test_data_sute) - @unittest.expectedFailure def test_mul_u55_BI( self, test_name: str, @@ -151,4 +153,18 @@ def test_mul_u55_BI( other_: torch.Tensor, ): test_data = (input_, other_) - self._test_mul_u55_BI_pipeline(self.Mul(), test_data) + self._test_mul_ethosu_BI_pipeline( + common.get_u55_compile_spec(), self.Mul(), test_data + ) + + @parameterized.expand(test_data_sute) + def test_mul_u85_BI( + self, + test_name: str, + input_: torch.Tensor, + other_: torch.Tensor, + ): + test_data = (input_, other_) + self._test_mul_ethosu_BI_pipeline( + common.get_u85_compile_spec(), self.Mul(), test_data + ) diff --git a/backends/arm/test/ops/test_relu.py b/backends/arm/test/ops/test_relu.py new file mode 100644 index 00000000000..effbccc74d5 --- /dev/null +++ b/backends/arm/test/ops/test_relu.py @@ -0,0 +1,132 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# Copyright 2024 Arm Limited and/or its affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +import unittest + +from typing import Tuple + +import torch +from executorch.backends.arm.quantizer.arm_quantizer import ( + ArmQuantizer, + get_symmetric_quantization_config, +) +from executorch.backends.arm.test import common +from executorch.backends.arm.test.tester.arm_tester import ArmTester +from executorch.backends.xnnpack.test.tester.tester import Quantize +from executorch.exir.backend.backend_details import CompileSpec +from parameterized import parameterized + + +test_data_suite = [ + # (test_name, test_data) + ("zeros", torch.zeros(1, 10, 10, 10)), + ("ones", torch.ones(10, 10, 10)), + ("rand", torch.rand(10, 10) - 0.5), + ("randn_pos", torch.randn(10) + 10), + ("randn_neg", torch.randn(10) - 10), + ("ramp", torch.arange(-16, 16, 0.2)), +] + + +class TestRelu(unittest.TestCase): + class Relu(torch.nn.Module): + def __init__(self): + super().__init__() + self.relu = torch.nn.ReLU() + + def forward(self, x): + return self.relu(x) + + def _test_relu_tosa_MI_pipeline( + self, module: torch.nn.Module, test_data: Tuple[torch.tensor] + ): + ( + ArmTester( + module, + example_inputs=test_data, + compile_spec=common.get_tosa_compile_spec(), + ) + .export() + .check(["torch.ops.aten.relu.default"]) + .check_not(["torch.ops.quantized_decomposed"]) + .to_edge() + .partition() + .check_not(["executorch_exir_dialects_edge__ops_aten_relu_default"]) + .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) + .to_executorch() + .run_method_and_compare_outputs(inputs=test_data) + ) + + def _test_relu_tosa_BI_pipeline( + self, module: torch.nn.Module, test_data: Tuple[torch.tensor] + ): + quantizer = ArmQuantizer().set_io(get_symmetric_quantization_config()) + ( + ArmTester( + module, + example_inputs=test_data, + compile_spec=common.get_tosa_compile_spec(), + ) + .quantize(Quantize(quantizer, get_symmetric_quantization_config())) + .export() + .check_count({"torch.ops.aten.relu.default": 1}) + .check(["torch.ops.quantized_decomposed"]) + .to_edge() + .partition() + .check_not(["executorch_exir_dialects_edge__ops_aten_relu_default"]) + .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) + .to_executorch() + .run_method_and_compare_outputs(inputs=test_data) + ) + + def _test_relu_ethosu_BI_pipeline( + self, + compile_spec: CompileSpec, + module: torch.nn.Module, + test_data: Tuple[torch.tensor], + ): + quantizer = ArmQuantizer().set_io(get_symmetric_quantization_config()) + ( + ArmTester( + module, + example_inputs=test_data, + compile_spec=compile_spec, + ) + .quantize(Quantize(quantizer, get_symmetric_quantization_config())) + .export() + .check_count({"torch.ops.aten.relu.default": 1}) + .check(["torch.ops.quantized_decomposed"]) + .to_edge() + .partition() + .check_not(["executorch_exir_dialects_edge__ops_aten_relu_default"]) + .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) + .to_executorch() + ) + + @parameterized.expand(test_data_suite) + def test_relu_tosa_MI( + self, + test_name: str, + test_data: torch.Tensor, + ): + self._test_relu_tosa_MI_pipeline(self.Relu(), (test_data,)) + + @parameterized.expand(test_data_suite) + def test_relu_tosa_BI(self, test_name: str, test_data: torch.Tensor): + self._test_relu_tosa_BI_pipeline(self.Relu(), (test_data,)) + + @parameterized.expand(test_data_suite) + def test_relu_u55_BI(self, test_name: str, test_data: torch.Tensor): + self._test_relu_ethosu_BI_pipeline( + common.get_u55_compile_spec(), self.Relu(), (test_data,) + ) + + @parameterized.expand(test_data_suite) + def test_relu_u85_BI(self, 
test_name: str, test_data: torch.Tensor): + self._test_relu_ethosu_BI_pipeline( + common.get_u85_compile_spec(), self.Relu(), (test_data,) + ) diff --git a/backends/arm/test/ops/test_repeat.py b/backends/arm/test/ops/test_repeat.py index a6fad033456..542f0d6256b 100644 --- a/backends/arm/test/ops/test_repeat.py +++ b/backends/arm/test/ops/test_repeat.py @@ -21,6 +21,7 @@ from executorch.backends.arm.test.tester.arm_tester import ArmTester from executorch.backends.xnnpack.test.tester.tester import Quantize +from executorch.exir.backend.backend_details import CompileSpec from parameterized import parameterized @@ -77,13 +78,15 @@ def _test_repeat_tosa_BI_pipeline(self, module: torch.nn.Module, test_data: Tupl .run_method_and_compare_outputs(inputs=test_data, qtol=1) ) - def _test_repeat_tosa_u55_pipeline(self, module: torch.nn.Module, test_data: Tuple): + def _test_repeat_ethosu_pipeline( + self, compile_spec: CompileSpec, module: torch.nn.Module, test_data: Tuple + ): quantizer = ArmQuantizer().set_io(get_symmetric_quantization_config()) ( ArmTester( module, example_inputs=test_data, - compile_spec=common.get_u55_compile_spec(), + compile_spec=compile_spec, ) .quantize(Quantize(quantizer, get_symmetric_quantization_config())) .export() @@ -103,8 +106,16 @@ def test_repeat_tosa_MI(self, test_input, multiples): def test_repeat_tosa_BI(self, test_input, multiples): self._test_repeat_tosa_BI_pipeline(self.Repeat(), (test_input, multiples)) - # Expected failure since tosa.TILE is unsupported by Vela. @parameterized.expand(Repeat.test_parameters) - @unittest.expectedFailure + @unittest.expectedFailure # TODO: MLBEDSW-9386 def test_repeat_u55_BI(self, test_input, multiples): - self._test_repeat_tosa_u55_pipeline(self.Repeat(), (test_input, multiples)) + self._test_repeat_ethosu_pipeline( + common.get_u55_compile_spec(), self.Repeat(), (test_input, multiples) + ) + + @parameterized.expand(Repeat.test_parameters) + @unittest.expectedFailure # TODO: MLBEDSW-9386 + def test_repeat_u85_BI(self, test_input, multiples): + self._test_repeat_ethosu_pipeline( + common.get_u85_compile_spec(), self.Repeat(), (test_input, multiples) + ) diff --git a/backends/arm/test/ops/test_sigmoid.py b/backends/arm/test/ops/test_sigmoid.py index 7a0435689f4..f75583164c1 100644 --- a/backends/arm/test/ops/test_sigmoid.py +++ b/backends/arm/test/ops/test_sigmoid.py @@ -13,6 +13,7 @@ import torch from executorch.backends.arm.test import common from executorch.backends.arm.test.tester.arm_tester import ArmTester +from executorch.exir.backend.compile_spec_schema import CompileSpec from parameterized import parameterized logger = logging.getLogger(__name__) @@ -102,14 +103,17 @@ def _test_sigmoid_tosa_BI_pipeline(self, module: torch.nn.Module, test_data: Tup .run_method_and_compare_outputs(inputs=test_data) ) - def _test_sigmoid_tosa_u55_BI_pipeline( - self, module: torch.nn.Module, test_data: Tuple[torch.tensor] + def _test_sigmoid_tosa_ethos_BI_pipeline( + self, + compile_spec: list[CompileSpec], + module: torch.nn.Module, + test_data: Tuple[torch.tensor], ): ( ArmTester( module, example_inputs=test_data, - compile_spec=common.get_u55_compile_spec(), + compile_spec=compile_spec, ) .quantize() .export() @@ -122,6 +126,20 @@ def _test_sigmoid_tosa_u55_BI_pipeline( .to_executorch() ) + def _test_sigmoid_tosa_u55_BI_pipeline( + self, module: torch.nn.Module, test_data: Tuple[torch.tensor] + ): + self._test_sigmoid_tosa_ethos_BI_pipeline( + common.get_u55_compile_spec(), module, test_data + ) + + def 
_test_sigmoid_tosa_u85_BI_pipeline( + self, module: torch.nn.Module, test_data: Tuple[torch.tensor] + ): + self._test_sigmoid_tosa_ethos_BI_pipeline( + common.get_u85_compile_spec(), module, test_data + ) + @parameterized.expand(test_data_suite) def test_sigmoid_tosa_MI( self, @@ -145,8 +163,10 @@ def test_sigmoid_add_sigmoid_tosa_BI(self): self.SigmoidAddSigmoid(), (test_data_suite[4][1], test_data_suite[3][1]) ) - # Fails due to Vela diff from Tosa spec, expected to work with Regor. @parameterized.expand(test_data_suite) - @unittest.expectedFailure def test_sigmoid_tosa_u55_BI(self, test_name: str, test_data: torch.Tensor): self._test_sigmoid_tosa_u55_BI_pipeline(self.Sigmoid(), (test_data,)) + + @parameterized.expand(test_data_suite) + def test_sigmoid_tosa_u85_BI(self, test_name: str, test_data: torch.Tensor): + self._test_sigmoid_tosa_u85_BI_pipeline(self.Sigmoid(), (test_data,)) diff --git a/backends/arm/test/ops/test_slice.py b/backends/arm/test/ops/test_slice.py index 14874df156e..ca026c7f420 100644 --- a/backends/arm/test/ops/test_slice.py +++ b/backends/arm/test/ops/test_slice.py @@ -15,6 +15,7 @@ from executorch.backends.arm.test import common from executorch.backends.arm.test.tester.arm_tester import ArmTester from executorch.backends.xnnpack.test.tester.tester import Quantize +from executorch.exir.backend.compile_spec_schema import CompileSpec from parameterized import parameterized @@ -77,8 +78,11 @@ def _test_slice_tosa_BI_pipeline( .run_method_and_compare_outputs(inputs=test_data, qtol=1) ) - def _test_slice_u55_BI_pipeline( - self, module: torch.nn.Module, test_data: Tuple[torch.Tensor] + def _test_slice_ethos_BI_pipeline( + self, + compile_spec: list[CompileSpec], + module: torch.nn.Module, + test_data: Tuple[torch.Tensor], ): quantizer = ArmQuantizer().set_io(get_symmetric_quantization_config()) ( @@ -96,6 +100,20 @@ def _test_slice_u55_BI_pipeline( .to_executorch() ) + def _test_slice_u55_BI_pipeline( + self, module: torch.nn.Module, test_data: Tuple[torch.Tensor] + ): + self._test_slice_ethos_BI_pipeline( + common.get_u55_compile_spec(), module, test_data + ) + + def _test_slice_u85_BI_pipeline( + self, module: torch.nn.Module, test_data: Tuple[torch.Tensor] + ): + self._test_slice_ethos_BI_pipeline( + common.get_u85_compile_spec(), module, test_data + ) + @parameterized.expand(Slice.test_tensors) def test_slice_tosa_MI(self, tensor): self._test_slice_tosa_MI_pipeline(self.Slice(), (tensor,)) @@ -108,9 +126,10 @@ def test_slice_nchw_tosa_BI(self, test_tensor: torch.Tensor): def test_slice_nhwc_tosa_BI(self, test_tensor: torch.Tensor): self._test_slice_tosa_BI_pipeline(self.Slice(), (test_tensor,), True) - # Fails during Vela compilation when trying to use a Tuple as a Named tuple, - # Could be Vela Issue, wait until Regor. @parameterized.expand(Slice.test_tensors) - @unittest.expectedFailure def test_slice_u55_BI(self, test_tensor: torch.Tensor): self._test_slice_u55_BI_pipeline(self.Slice(), (test_tensor,)) + + @parameterized.expand(Slice.test_tensors) + def test_slice_u85_BI(self, test_tensor: torch.Tensor): + self._test_slice_u85_BI_pipeline(self.Slice(), (test_tensor,)) diff --git a/backends/arm/test/ops/test_softmax.py b/backends/arm/test/ops/test_softmax.py index b3b6230daa7..a7d25d266de 100644 --- a/backends/arm/test/ops/test_softmax.py +++ b/backends/arm/test/ops/test_softmax.py @@ -5,7 +5,6 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
-import logging import unittest from typing import Tuple @@ -13,17 +12,20 @@ import torch from executorch.backends.arm.test import common from executorch.backends.arm.test.tester.arm_tester import ArmTester +from executorch.exir.backend.compile_spec_schema import CompileSpec from parameterized import parameterized -logger = logging.getLogger(__name__) -logger.setLevel(logging.INFO) test_data_suite = [ # (test_name, test_data, dim) - ("zeros", torch.zeros(10, 10, 10, 10), 1), + ("zeros", torch.zeros(10, 10, 10, 10), 0), + ("zeros_neg_dim", torch.zeros(10, 10, 10, 10), -4), ("ones", torch.ones(10, 10, 10, 10), 1), + ("ones_neg_dim", torch.ones(10, 10, 10, 10), -1), ("rand", torch.rand(10, 10, 10, 10), 2), + ("rand_neg_dim", torch.rand(10, 10, 10, 10), -2), ("randn", torch.randn(10, 10, 10, 10), 3), + ("randn_neg_dim", torch.randn(10, 10, 10, 10), -3), ] @@ -79,14 +81,17 @@ def _test_softmax_tosa_BI_pipeline( .run_method_and_compare_outputs(inputs=test_data, qtol=1) ) - def _test_softmax_tosa_u55_BI_pipeline( - self, module: torch.nn.Module, test_data: Tuple[torch.tensor] + def _test_softmax_tosa_ethos_BI_pipeline( + self, + compile_spec: list[CompileSpec], + module: torch.nn.Module, + test_data: Tuple[torch.tensor], ): ( ArmTester( module, example_inputs=test_data, - compile_spec=common.get_u55_compile_spec(), + compile_spec=compile_spec, ) .quantize() .export() @@ -99,6 +104,20 @@ def _test_softmax_tosa_u55_BI_pipeline( .to_executorch() ) + def _test_softmax_tosa_u55_BI_pipeline( + self, module: torch.nn.Module, test_data: Tuple[torch.tensor] + ): + self._test_softmax_tosa_ethos_BI_pipeline( + common.get_u55_compile_spec(), module, test_data + ) + + def _test_softmax_tosa_u85_BI_pipeline( + self, module: torch.nn.Module, test_data: Tuple[torch.tensor] + ): + self._test_softmax_tosa_ethos_BI_pipeline( + common.get_u85_compile_spec(), module, test_data + ) + @parameterized.expand(test_data_suite) def test_softmax_tosa_MI( self, @@ -131,3 +150,13 @@ def test_softmax_tosa_u55_BI( dim: int, ): self._test_softmax_tosa_u55_BI_pipeline(self.Softmax(dim=dim), (test_data,)) + + @parameterized.expand(test_data_suite) + @unittest.expectedFailure + def test_softmax_tosa_u85_BI( + self, + test_name: str, + test_data: torch.Tensor, + dim: int, + ): + self._test_softmax_tosa_u85_BI_pipeline(self.Softmax(dim=dim), (test_data,)) diff --git a/backends/arm/test/ops/test_split.py b/backends/arm/test/ops/test_split.py index bc998179c0c..02133d4e7f4 100644 --- a/backends/arm/test/ops/test_split.py +++ b/backends/arm/test/ops/test_split.py @@ -14,6 +14,7 @@ from executorch.backends.arm.test import common from executorch.backends.arm.test.tester.arm_tester import ArmTester from executorch.backends.xnnpack.test.tester.tester import Quantize +from executorch.exir.backend.compile_spec_schema import CompileSpec from parameterized import parameterized test_data_t = tuple[torch.Tensor, int | list[int], int] @@ -94,15 +95,15 @@ def _test_split_tosa_BI_pipeline( .run_method_and_compare_outputs(inputs=test_data, qtol=1) ) - def _test_split_u55_BI_pipeline( - self, module: torch.nn.Module, test_data: test_data_t + def _test_split_ethosu_BI_pipeline( + self, compile_spec: CompileSpec, module: torch.nn.Module, test_data: test_data_t ): quantizer = ArmQuantizer().set_io(get_symmetric_quantization_config()) ( ArmTester( module, example_inputs=test_data, - compile_spec=common.get_u55_compile_spec(), + compile_spec=compile_spec, ) .quantize(Quantize(quantizer, get_symmetric_quantization_config())) .export() @@ -131,9 +132,33 @@ 
def test_split_n_out_tosa_MI(self, test_data: test_data_t): def test_split_tosa_BI(self, test_data: test_data_t): self._test_split_tosa_BI_pipeline(self.Split(), test_data) - # Fails during Vela compilation when trying to use a Tuple as a Named tuple, - # Could be Vela Issue, wait until Regor. - @parameterized.expand(Split.test_data) - @unittest.expectedFailure + @parameterized.expand( + [Split.test_data[0], Split.test_data[1], Split.test_data[2], Split.test_data[4]] + ) def test_split_u55_BI(self, test_data: test_data_t): - self._test_split_u55_BI_pipeline(self.Split(), test_data) + self._test_split_ethosu_BI_pipeline( + common.get_u55_compile_spec(), self.Split(), test_data + ) + + # TODO MLETORCH-350 + @parameterized.expand([Split.test_data[3], Split.test_data[5]]) + @unittest.expectedFailure + def test_split_u55_BI_skip(self, test_data: test_data_t): + self._test_split_ethosu_BI_pipeline( + common.get_u55_compile_spec(), self.Split(), test_data + ) + + @parameterized.expand( + [Split.test_data[0], Split.test_data[1], Split.test_data[2], Split.test_data[4]] + ) + def test_split_u85_BI(self, test_data: test_data_t): + self._test_split_ethosu_BI_pipeline( + common.get_u85_compile_spec(), self.Split(), test_data + ) + + @parameterized.expand([Split.test_data[3], Split.test_data[5]]) + @unittest.expectedFailure + def test_split_u85_BI_skip(self, test_data: test_data_t): + self._test_split_ethosu_BI_pipeline( + common.get_u85_compile_spec(), self.Split(), test_data + ) diff --git a/backends/arm/test/ops/test_sub.py b/backends/arm/test/ops/test_sub.py index 2ae7c3ab36f..e80c0436989 100644 --- a/backends/arm/test/ops/test_sub.py +++ b/backends/arm/test/ops/test_sub.py @@ -13,6 +13,7 @@ from executorch.backends.arm.test import common from executorch.backends.arm.test.tester.arm_tester import ArmTester +from executorch.exir.backend.compile_spec_schema import CompileSpec from parameterized import parameterized @@ -75,14 +76,17 @@ def _test_sub_tosa_BI_pipeline( .run_method_and_compare_outputs(inputs=test_data, qtol=1) ) - def _test_sub_u55_BI_pipeline( - self, module: torch.nn.Module, test_data: Tuple[torch.Tensor] + def _test_sub_ethosu_BI_pipeline( + self, + compile_spec: list[CompileSpec], + module: torch.nn.Module, + test_data: Tuple[torch.Tensor], ): ( ArmTester( module, example_inputs=test_data, - compile_spec=common.get_u55_compile_spec(), + compile_spec=compile_spec, ) .quantize() .export() @@ -104,14 +108,40 @@ def test_sub_tosa_BI(self, test_data: torch.Tensor): test_data = (test_data,) self._test_sub_tosa_BI_pipeline(self.Sub(), test_data) - # Expected to fail since RESCALE cannot be fused with SUB in Vela. 
@parameterized.expand(Sub.test_parameters) - @unittest.expectedFailure def test_sub_u55_BI(self, test_data: torch.Tensor): test_data = (test_data,) - self._test_sub_u55_BI_pipeline(self.Sub(), test_data) + self._test_sub_ethosu_BI_pipeline( + common.get_u55_compile_spec(), self.Sub(), test_data + ) + + @parameterized.expand(Sub.test_parameters) + def test_sub_u85_BI(self, test_data: torch.Tensor): + test_data = (test_data,) + self._test_sub_ethosu_BI_pipeline( + common.get_u85_compile_spec(), self.Sub(), test_data + ) @parameterized.expand(Sub2.test_parameters) def test_sub2_tosa_MI(self, operand1: torch.Tensor, operand2: torch.Tensor): test_data = (operand1, operand2) self._test_sub_tosa_MI_pipeline(self.Sub2(), test_data) + + @parameterized.expand(Sub2.test_parameters) + def test_sub2_tosa_BI(self, operand1: torch.Tensor, operand2: torch.Tensor): + test_data = (operand1, operand2) + self._test_sub_tosa_BI_pipeline(self.Sub2(), test_data) + + @parameterized.expand(Sub2.test_parameters) + def test_sub2_u55_BI(self, operand1: torch.Tensor, operand2: torch.Tensor): + test_data = (operand1, operand2) + self._test_sub_ethosu_BI_pipeline( + common.get_u55_compile_spec(), self.Sub2(), test_data + ) + + @parameterized.expand(Sub2.test_parameters) + def test_sub2_u85_BI(self, operand1: torch.Tensor, operand2: torch.Tensor): + test_data = (operand1, operand2) + self._test_sub_ethosu_BI_pipeline( + common.get_u85_compile_spec(), self.Sub2(), test_data + ) diff --git a/backends/arm/test/ops/test_unsqueeze.py b/backends/arm/test/ops/test_unsqueeze.py new file mode 100644 index 00000000000..9c79d4371c3 --- /dev/null +++ b/backends/arm/test/ops/test_unsqueeze.py @@ -0,0 +1,115 @@ +# Copyright 2024 Arm Limited and/or its affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +# +# Tests the unsqueeze op which copies the data of the input tensor (possibly with new data format) +# + +import unittest +from typing import Sequence, Tuple + +import torch + +from executorch.backends.arm.quantizer.arm_quantizer import ( + ArmQuantizer, + get_symmetric_quantization_config, +) +from executorch.backends.arm.test import common +from executorch.backends.arm.test.tester.arm_tester import ArmTester + +from executorch.backends.xnnpack.test.tester.tester import Quantize +from executorch.exir.backend.compile_spec_schema import CompileSpec +from parameterized import parameterized + + +class TestSimpleUnsqueeze(unittest.TestCase): + class Unsqueeze(torch.nn.Module): + shapes: list[int | Sequence[int]] = [5, (5, 5), (5, 5), (5, 5, 5)] + test_parameters: list[tuple[torch.Tensor]] = [(torch.ones(n),) for n in shapes] + + def forward(self, x: torch.Tensor, dim): + return x.unsqueeze(dim) + + def _test_unsqueeze_tosa_MI_pipeline( + self, module: torch.nn.Module, test_data: Tuple[torch.Tensor, int] + ): + ( + ArmTester( + module, + example_inputs=test_data, + compile_spec=common.get_tosa_compile_spec(), + ) + .export() + .check_count({"torch.ops.aten.unsqueeze.default": 1}) + .to_edge() + .partition() + .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) + .to_executorch() + .run_method_and_compare_outputs(inputs=test_data) + ) + + def _test_unsqueeze_tosa_BI_pipeline( + self, module: torch.nn.Module, test_data: Tuple[torch.Tensor, int] + ): + quantizer = ArmQuantizer().set_io(get_symmetric_quantization_config()) + ( + ArmTester( + module, + example_inputs=test_data, + compile_spec=common.get_tosa_compile_spec(), + ) + .quantize(Quantize(quantizer, get_symmetric_quantization_config())) + .export() + .check_count({"torch.ops.aten.unsqueeze.default": 1}) + .to_edge() + .partition() + .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) + .to_executorch() + .run_method_and_compare_outputs(inputs=test_data, qtol=1) + ) + + def _test_unsqueeze_ethosu_BI_pipeline( + self, + compile_spec: CompileSpec, + module: torch.nn.Module, + test_data: Tuple[torch.Tensor, int], + ): + quantizer = ArmQuantizer().set_io(get_symmetric_quantization_config()) + ( + ArmTester( + module, + example_inputs=test_data, + compile_spec=compile_spec, + ) + .quantize(Quantize(quantizer, get_symmetric_quantization_config())) + .export() + .check_count({"torch.ops.aten.unsqueeze.default": 1}) + .to_edge() + .partition() + .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) + .to_executorch() + ) + + @parameterized.expand(Unsqueeze.test_parameters) + def test_unsqueeze_tosa_MI(self, test_tensor: torch.Tensor): + for i in range(-test_tensor.dim() - 1, test_tensor.dim() + 1): + self._test_unsqueeze_tosa_MI_pipeline(self.Unsqueeze(), (test_tensor, i)) + + @parameterized.expand(Unsqueeze.test_parameters) + def test_unsqueeze_tosa_BI(self, test_tensor: torch.Tensor): + self._test_unsqueeze_tosa_BI_pipeline(self.Unsqueeze(), (test_tensor, 0)) + + @parameterized.expand(Unsqueeze.test_parameters) + def test_unsqueeze_u55_BI(self, test_tensor: torch.Tensor): + self._test_unsqueeze_ethosu_BI_pipeline( + common.get_u55_compile_spec(), self.Unsqueeze(), (test_tensor, 0) + ) + + @parameterized.expand(Unsqueeze.test_parameters) + def test_unsqueeze_u85_BI(self, test_tensor: torch.Tensor): + self._test_unsqueeze_ethosu_BI_pipeline( + common.get_u85_compile_spec(), self.Unsqueeze(), (test_tensor, 0) + ) diff --git a/backends/arm/test/ops/test_view.py b/backends/arm/test/ops/test_view.py 
index 1f51261bf7a..53025c0ac08 100644 --- a/backends/arm/test/ops/test_view.py +++ b/backends/arm/test/ops/test_view.py @@ -21,6 +21,7 @@ from executorch.backends.arm.test.tester.arm_tester import ArmTester from executorch.backends.xnnpack.test.tester.tester import Quantize +from executorch.exir.backend.compile_spec_schema import CompileSpec from parameterized import parameterized @@ -73,8 +74,11 @@ def _test_view_tosa_BI_pipeline( .run_method_and_compare_outputs(inputs=test_data, qtol=1) ) - def _test_view_u55_BI_pipeline( - self, module: torch.nn.Module, test_data: Tuple[torch.Tensor] + def _test_view_ethos_BI_pipeline( + self, + compile_spec: list[CompileSpec], + module: torch.nn.Module, + test_data: Tuple[torch.Tensor], ): quantizer = ArmQuantizer().set_io(get_symmetric_quantization_config()) ( @@ -92,6 +96,20 @@ def _test_view_u55_BI_pipeline( .to_executorch() ) + def _test_view_u55_BI_pipeline( + self, module: torch.nn.Module, test_data: Tuple[torch.Tensor] + ): + self._test_view_ethos_BI_pipeline( + common.get_u55_compile_spec(), module, test_data + ) + + def _test_view_u85_BI_pipeline( + self, module: torch.nn.Module, test_data: Tuple[torch.Tensor] + ): + self._test_view_ethos_BI_pipeline( + common.get_u85_compile_spec(), module, test_data + ) + @parameterized.expand(View.test_parameters) def test_view_tosa_MI(self, test_tensor: torch.Tensor): self._test_view_tosa_MI_pipeline(self.View(), (test_tensor,)) @@ -103,3 +121,7 @@ def test_view_tosa_BI(self, test_tensor: torch.Tensor): @parameterized.expand(View.test_parameters) def test_view_u55_BI(self, test_tensor: torch.Tensor): self._test_view_u55_BI_pipeline(self.View(), (test_tensor,)) + + @parameterized.expand(View.test_parameters) + def test_view_u85_BI(self, test_tensor: torch.Tensor): + self._test_view_u85_BI_pipeline(self.View(), (test_tensor,)) diff --git a/backends/arm/test/passes/test_meandim_to_averagepool2d.py b/backends/arm/test/passes/test_meandim_to_averagepool2d.py new file mode 100644 index 00000000000..1cd63e6e52e --- /dev/null +++ b/backends/arm/test/passes/test_meandim_to_averagepool2d.py @@ -0,0 +1,75 @@ +# Copyright 2024 Arm Limited and/or its affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import unittest + +import torch +from executorch.backends.arm.passes.meandim_to_averagepool_pass import ( + ConvertMeanDimToAveragePool, +) + +from executorch.backends.arm.test import common +from executorch.backends.arm.test.tester.arm_tester import ArmTester + +from executorch.backends.xnnpack.test.tester.tester import RunPasses + + +class MeanDim(torch.nn.Module): + def forward(self, x): + return torch.mean(x, dim=[-1, -2], keepdim=True) + + def get_inputs(self): + return (torch.rand(1, 1280, 7, 7),) + + +class MeanDim2(torch.nn.Module): + def forward(self, x): + return torch.mean(x, dim=1) + + def get_inputs(self): + return (torch.rand(1, 1280, 7, 7),) + + +class TestMeandimToAveragePool2dPass(unittest.TestCase): + """ + Tests the MeanDimToAveragePool2dPass which converts mean.dim to average_pool2d + for the special case where dim is [-1, -2] and keepdim is True. 
+ """ + + def test_tosa_BI_meandim_to_averagepool(self): + module = MeanDim() + test_pass_stage = RunPasses([ConvertMeanDimToAveragePool]) + ( + ArmTester( + module, + example_inputs=module.get_inputs(), + compile_spec=common.get_tosa_compile_spec(), + ) + .quantize() + .export() + .to_edge() + .check(["executorch_exir_dialects_edge__ops_aten_mean_dim"]) + .run_passes(test_pass_stage) + .check(["executorch_exir_dialects_edge__ops_aten_avg_pool2d_default"]) + ) + + def test_tosa_BI_meandim_no_modification(self): + module = MeanDim2() + test_pass_stage = RunPasses([ConvertMeanDimToAveragePool]) + ( + ArmTester( + module, + example_inputs=module.get_inputs(), + compile_spec=common.get_tosa_compile_spec(), + ) + .quantize() + .export() + .to_edge() + .check(["executorch_exir_dialects_edge__ops_aten_mean_dim"]) + .run_passes(test_pass_stage) + .check(["executorch_exir_dialects_edge__ops_aten_mean_dim"]) + .check_not(["executorch_exir_dialects_edge__ops_aten_avg_pool2d_default"]) + ) diff --git a/backends/arm/test/runner_utils.py b/backends/arm/test/runner_utils.py index 4e3b447103c..6e8b9b25ede 100644 --- a/backends/arm/test/runner_utils.py +++ b/backends/arm/test/runner_utils.py @@ -10,6 +10,7 @@ import subprocess import tempfile +from pathlib import Path from typing import Dict, List, Optional, Tuple import numpy as np @@ -265,9 +266,12 @@ def run_corstone300( raise RuntimeError( f"Corstone simulation failed, log: \n {result_stdout}\n{result.stderr.decode()}" ) + elif "E [" in result_stdout: + logger.error(result_stdout) tosa_ref_output = np.fromfile(out_path_with_suffix, dtype=np.float32) - tosa_ref_output = torch.from_numpy(tosa_ref_output).reshape(inputs[0].shape) + output_shape = self.output_node.args[0][0].meta["val"].shape + tosa_ref_output = torch.from_numpy(tosa_ref_output).reshape(output_shape) return [tosa_ref_output] def run_tosa_ref_model( @@ -275,10 +279,10 @@ def run_tosa_ref_model( inputs: Tuple[torch.Tensor], ) -> list[torch.Tensor]: """ - Run TOSA reference model using the tosa_refence_model program. + Run TOSA reference model using the tosa_reference_model program. In order to do that we need: - 1. desc.json, which points to files needed by tosa_refence_model. + 1. desc.json, which points to files needed by tosa_reference_model. 2. output.tosa, which is the TOSA buffer that describes the model we're trying to run. @@ -287,12 +291,6 @@ def run_tosa_ref_model( All these files are saved on disk in self.intermediate_path. Args: - params_input (Tuple[List[str], List[QuantizationParams]]): A tuple - containing a list of input node names and a list of their - quantization parameters (if model is quantized). - param_output (Tuple[str, QuantizationParams]): A tuple containing - the output node name and its quantization parameters (if - model is quantized). inputs (Tuple[torch.Tensor]): The input data to run the TOSA Returns: @@ -328,7 +326,18 @@ def run_tosa_ref_model( self._has_init_run ), "RunnerUtil needs to be initialized using init_run() before running tosa reference." - desc_file_path = os.path.join(self.intermediate_path, "desc.json") + all_desc_file_paths = [ + str(path) for path in Path(self.intermediate_path).glob("desc*.json") + ] + assert ( + all_desc_file_paths + ), f"No TOSA description file found in '{self.intermediate_path}'." + if len(all_desc_file_paths) != 1: + raise NotImplementedError( + "Graphs with more than one partition are currently not supported." 
+ ) + + desc_file_path = all_desc_file_paths[0] assert os.path.exists( desc_file_path ), f"desc_file_path: {desc_file_path} does not exist" @@ -423,7 +432,7 @@ def save_npy( Parameters: path: the directory where to save the data. data: the data to save. - is_quantize: whether to quantize the data before saving it. + is_quantized: whether to quantize the data before saving it. input_name: the name of the file, without file-ending. quant_param: the parameters to use for quantization. Returns: @@ -448,7 +457,7 @@ def save_bytes( Parameters: path: the directory where to save the data. data: the data to save. - is_quantize: whether to quantize the data before saving it. + is_quantized: whether to quantize the data before saving it. input_name: the name of the file, without file-ending. quant_param: the parameters to use for quantization. Returns: diff --git a/backends/arm/test/tester/arm_tester.py b/backends/arm/test/tester/arm_tester.py index 41fc907fdfe..2fe8c07e7d1 100644 --- a/backends/arm/test/tester/arm_tester.py +++ b/backends/arm/test/tester/arm_tester.py @@ -34,6 +34,7 @@ from executorch.backends.xnnpack.test.tester import Tester from executorch.exir import EdgeCompileConfig from executorch.exir.backend.compile_spec_schema import CompileSpec +from executorch.exir.lowered_backend_module import LoweredBackendModule from torch.fx import Graph logger = logging.getLogger(__name__) @@ -44,21 +45,42 @@ class Partition(tester.Partition): def dump_artifact(self, path_to_dump: Optional[str]): super().dump_artifact(path_to_dump) - to_print = None - for spec in self.graph_module.lowered_module_0.compile_specs: - if spec.key == "output_format": - if spec.value == b"tosa": - tosa_fb = self.graph_module.lowered_module_0.processed_bytes + def get_output_format(lowered_module) -> str | None: + for spec in lowered_module.compile_specs: + if spec.key == "output_format": + return spec.value.decode() + return None + + output = "" + for node in self.graph_module.graph.nodes: + if node.op == "get_attr" and node.name.startswith("lowered_module_"): + lowered_module = getattr(self.graph_module, node.name) + assert isinstance( + lowered_module, LoweredBackendModule + ), f"Attribute {node.name} must be of type LoweredBackendModule." + + output_format = get_output_format(lowered_module) + if output_format == "tosa": + tosa_fb = lowered_module.processed_bytes to_print = dbg_tosa_fb_to_json(tosa_fb) to_print = pformat(to_print, compact=True, indent=1) - to_print = f"\n TOSA deserialized: \n{to_print}" - elif spec.value == b"vela": - vela_cmd_stream = self.graph_module.lowered_module_0.processed_bytes - to_print = str(vela_cmd_stream) - to_print = f"\n Vela command stream: \n{to_print}" - break - assert to_print is not None, "No TOSA nor Vela compile spec found" - _dump_str(to_print, path_to_dump) + output += f"\nTOSA deserialized {node.name}: \n{to_print}\n" + elif output_format == "vela": + vela_cmd_stream = lowered_module.processed_bytes + output += ( + f"\nVela command stream {node.name}: \n{vela_cmd_stream}\n" + ) + else: + logger.warning( + f"No TOSA nor Vela compile spec found in compile specs of {node.name}." + ) + continue + + if not output: + logger.warning("No output to print generated from artifact.") + return + + _dump_str(output, path_to_dump) class Serialize(tester.Serialize): @@ -242,16 +264,21 @@ def run_method_and_compare_outputs( # Loop inputs and compare reference stage with the compared stage. 
for run_iteration in range(num_runs): reference_input = inputs if inputs else next(self.generate_random_inputs()) - if is_nhwc: - test_input = self.transpose_data_format(reference_input, "NHWC") - else: - test_input = reference_input # Test parameters can include constants that are used in eager mode but are already set as attributes # in TOSA. Therefore, only accept torch.Tensor inputs. - test_input = [ - tensor for tensor in test_input if isinstance(tensor, torch.Tensor) - ] + test_input: list[torch.Tensor] = [] + for arg in reference_input: + if isinstance(arg, torch.Tensor): + test_input.append(arg) + if isinstance(arg, tuple) and isinstance(arg[0], torch.Tensor): + test_input.extend(list(arg)) + + if ( + is_nhwc + and test_stage == self.stages[self.stage_name(tester.ToExecutorch)] + ): + test_input = self.transpose_data_format(test_input, "NHWC") input_shapes = [ generated_input.shape if hasattr(generated_input, "shape") else (1,) @@ -261,7 +288,10 @@ def run_method_and_compare_outputs( reference_output = reference_stage.run_artifact(reference_input) test_output = tuple(test_stage.run_artifact(test_input)) - if is_nhwc: + if ( + is_nhwc + and test_stage == self.stages[self.stage_name(tester.ToExecutorch)] + ): test_output = self.transpose_data_format(test_output, "NCHW") self._compare_outputs( diff --git a/backends/arm/tosa_mapping.py b/backends/arm/tosa_mapping.py index 5749d1e2043..0baf3e2ec1b 100644 --- a/backends/arm/tosa_mapping.py +++ b/backends/arm/tosa_mapping.py @@ -3,6 +3,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +# pyre-unsafe + # # PyTorch to Tosa mapping - simple mapping functions and multi-type extraction # of key information. These are used by the initial compile stage which captures diff --git a/backends/arm/tosa_quant_utils.py b/backends/arm/tosa_quant_utils.py index c0d16d51b25..8a90e432a69 100644 --- a/backends/arm/tosa_quant_utils.py +++ b/backends/arm/tosa_quant_utils.py @@ -3,18 +3,21 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +# pyre-unsafe + # Utiliy functions for TOSA quantized lowerings import math -from typing import NamedTuple +from typing import NamedTuple, Sequence import numpy as np import serializer.tosa_serializer as ts import torch.fx +import tosa.Op as TosaOp from executorch.backends.arm.tosa_mapping import map_dtype, TosaArg from executorch.exir.dialects._ops import ops as exir_ops -from serializer.tosa_serializer import TosaOp, TosaSerializerTensor +from serializer.tosa_serializer import TosaSerializerTensor from torch.fx import Node q_op = exir_ops.edge.quantized_decomposed.quantize_per_tensor.default @@ -65,6 +68,7 @@ def is_quant_node(node: torch.fx.Node): def get_quant_node_dtype(node: torch.fx.Node): + # pyre-ignore[16]: Undefined attribute. 
if "tosa" in node.target.__name__: return node.meta["val"].dtype @@ -231,7 +235,7 @@ def build_rescale_from_int32( rescale_scale, is_scale32=True, is_double_round=False, -) -> TosaSerializerTensor: +) -> None: multiplier, shift = compute_multiplier_and_shift(rescale_scale) attr_rescale_output = ts.TosaSerializerAttribute() attr_rescale_output.RescaleAttribute( @@ -254,7 +258,7 @@ def build_rescale_from_int32( def rescale_nodes_to_int32( - nodes: list[Node], tosa_graph: ts.TosaSerializer + nodes: Sequence[Node], tosa_graph: ts.TosaSerializer ) -> tuple[list[TosaSerializerTensor], float]: """Rescales all 'nodes' to int32, adding suitable RESCALE ops to 'tosa_graph'. The scales are adjusted using the smallest scale of all 'nodes'. diff --git a/backends/arm/tosa_utils.py b/backends/arm/tosa_utils.py index f84e371279b..cfafac16760 100644 --- a/backends/arm/tosa_utils.py +++ b/backends/arm/tosa_utils.py @@ -3,9 +3,11 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +# pyre-unsafe + import logging import os -from typing import Any, Dict +from typing import Any, cast, Dict import numpy as np import serializer.tosa_serializer as ts @@ -48,10 +50,10 @@ def dbg_node(node): # Output TOSA flatbuffer and test harness file -def dbg_tosa_dump(tosa_graph, path): - filename = "output.tosa" +def dbg_tosa_dump(tosa_graph: ts.TosaSerializer, path: str, suffix: str = ""): + filename = f"output{suffix}.tosa" - logger.info(f"Emitting debug output to {path}") + logger.info(f"Emitting debug output to: {path=}, {suffix=}") os.makedirs(path, exist_ok=True) @@ -63,7 +65,7 @@ def dbg_tosa_dump(tosa_graph, path): f.write(fb) assert os.path.exists(filepath_tosa_fb), "Failed to write TOSA flatbuffer" - filepath_desc_json = os.path.join(path, "desc.json") + filepath_desc_json = os.path.join(path, f"desc{suffix}.json") with open(filepath_desc_json, "w") as f: f.write(js) assert os.path.exists(filepath_desc_json), "Failed to write TOSA JSON" @@ -74,7 +76,7 @@ def dbg_fail(node, tosa_graph, path): logger.warn("Internal error due to poorly handled node:") dbg_node(node) logger.warn(f"Debug output captured in '{path}'.") - raise RuntimeError("TOSA Internal Error on node, enable logging for further info") + raise RuntimeError("TOSA Internal Error on node, enable logging for further info.") # Helper function to match TOSA's broadcasting rank requirement @@ -235,7 +237,7 @@ def build_avg_pool_2d_common( output_zp = 0 if is_quant_node: - input_zp = get_quant_node_args(node.args[0]).zp + input_zp = get_quant_node_args(cast(torch.fx.Node, node.args[0])).zp output_zp = get_quant_node_args(list(node.users)[0]).zp attr = ts.TosaSerializerAttribute() @@ -306,7 +308,9 @@ def process_call_function( ) # Visiting each Node + # pyre-ignore[16]: Undefined attribute. if node.target.__name__ in node_visitors: + # pyre-ignore[16]: Undefined attribute. node_visitors[node.target.__name__].define_node( node, tosa_graph, @@ -319,7 +323,10 @@ def process_call_function( def expand_dims( - tosa_graph: ts.TosaSerializer, input_node: TosaArg, dtype: ts.DType, dim: int + tosa_graph: ts.TosaSerializer, + input_node: TosaArg, + dtype: int, + dim: int, ) -> Any: """Inserts TOSA operators into the tosa_graph, that perform the equivalent of the expand_dims (a.k.a unsqueeze) operation. 
A new axis is created at the diff --git a/backends/cadence/CMakeLists.txt b/backends/cadence/CMakeLists.txt index f725655e0d6..d786142f085 100644 --- a/backends/cadence/CMakeLists.txt +++ b/backends/cadence/CMakeLists.txt @@ -27,8 +27,8 @@ set(_common_include_directories ${EXECUTORCH_ROOT}/..) set(TARGET_DIR reference) if(EXECUTORCH_NNLIB_OPT) -set(TARGET_DIR hifi) -add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/${TARGET_DIR}/third-party/nnlib) + set(TARGET_DIR hifi) + add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/${TARGET_DIR}/third-party/nnlib) endif() set(CMAKE_EXPORT_COMPILE_COMMANDS ON) @@ -68,7 +68,7 @@ target_include_directories( target_include_directories( cadence_runner PUBLIC ${ROOT_DIR}/.. ${CMAKE_BINARY_DIR} - ${_common_include_directories} + ${_common_include_directories} ) target_link_libraries( diff --git a/backends/cadence/aot/TARGETS b/backends/cadence/aot/TARGETS index d077169022a..08093efe317 100644 --- a/backends/cadence/aot/TARGETS +++ b/backends/cadence/aot/TARGETS @@ -60,6 +60,17 @@ python_library( ], ) +python_library( + name = "ops_registrations", + srcs = [ + "ops_registrations.py", + ], + deps = [ + "fbcode//caffe2:torch", + "fbcode//executorch/backends/cadence/aot:utils", + ], +) + export_file(name = "functions.yaml") executorch_generated_lib( diff --git a/backends/cadence/aot/compiler.py b/backends/cadence/aot/compiler.py index 509e254b550..e1494f8d20d 100644 --- a/backends/cadence/aot/compiler.py +++ b/backends/cadence/aot/compiler.py @@ -18,12 +18,13 @@ ReplaceLogicalNotBooleanWhereWithWherePass, ReplacePT2DequantWithCadenceDequantPass, ReplacePT2QuantWithCadenceQuantPass, + ReplaceSafeSoftmaxWithSoftmax, ReplaceScalarTensorWithFullPass, ReplaceSqueezeAndUnsqueezeWithViewPass, ) from executorch.backends.cadence.aot.quantizer.fusion_pass import QuantFusion from executorch.backends.cadence.aot.quantizer.quantizer import CadenceQuantizer -from executorch.backends.cadence.aot.utils import model_is_quantized +from executorch.backends.cadence.aot.utils import model_gm_has_SDPA, model_is_quantized from executorch.backends.transforms.decompose_sdpa import ( DecomposeScaledDotProductAttention, ) @@ -57,13 +58,20 @@ def convert_pt2( """ # Export with dynamo - model_exp = capture_pre_autograd_graph(model, inputs) + model_gm = capture_pre_autograd_graph(model, inputs) - # Decompose SDPA - DecomposeScaledDotProductAttention(False)(model_exp) + if model_gm_has_SDPA(model_gm): # pyre-fixme[6] + # Decompose SDPA + DecomposeScaledDotProductAttention(False)(model_gm) # pyre-fixme[6] + + # Swap _safe_softmax with _softmax (see https://github.com/pytorch/pytorch/pull/133882 + # for details). + result = ReplaceSafeSoftmaxWithSoftmax()(model_gm) # pyre-fixme[6] + assert result is not None + model_gm = result.graph_module # Prepare - prepared_model = prepare_pt2e(model_exp, quantizer) + prepared_model = prepare_pt2e(model_gm, quantizer) # Calibrate prepared_model(*inputs) diff --git a/backends/cadence/aot/ops_registrations.py b/backends/cadence/aot/ops_registrations.py index a4d856ebed2..e73de6ab7ce 100644 --- a/backends/cadence/aot/ops_registrations.py +++ b/backends/cadence/aot/ops_registrations.py @@ -4,12 +4,13 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
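The convert_pt2 change above only runs the SDPA decomposition when the captured graph actually contains an aten.scaled_dot_product_attention call, and swaps _safe_softmax for _softmax before prepare_pt2e. A minimal, self-contained sketch of that presence check (toy model and helper names are illustrative, not the ExecuTorch helper; torch.export.export_for_training needs a recent PyTorch, and whether SDPA survives capture as a single aten op can vary by release):

import torch


class TinyAttention(torch.nn.Module):
    def forward(self, q, k, v):
        return torch.nn.functional.scaled_dot_product_attention(q, k, v)


def graph_has_sdpa(gm: torch.fx.GraphModule) -> bool:
    # call_function nodes carry the resolved aten op as their target
    return any(
        node.op == "call_function"
        and node.target == torch.ops.aten.scaled_dot_product_attention.default
        for node in gm.graph.nodes
    )


if __name__ == "__main__":
    q = k = v = torch.randn(1, 2, 4, 8)
    captured = torch.export.export_for_training(TinyAttention(), (q, k, v)).module()
    # Gate the decomposition pass on the op actually being present.
    print("graph has SDPA:", graph_has_sdpa(captured))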
+# pyre-strict + from math import prod from typing import Optional, Tuple import torch -from executorch.exir.scalar_type import ScalarType -from torch.library import impl, Library +from torch.library import Library, register_fake from .utils import get_conv1d_output_size, get_conv2d_output_size @@ -67,31 +68,31 @@ m = Library("cadence", "IMPL", "Meta") -@impl(m, "quantize_per_tensor") +@register_fake("cadence::quantize_per_tensor") def quantize_per_tensor_meta( input: torch.Tensor, scale: float, zero_point: int, quant_min: int, quant_max: int, - dtype: ScalarType, -): + dtype: torch.dtype, +) -> torch.Tensor: return input.new_empty(input.size(), dtype=dtype) -@impl(m, "dequantize_per_tensor") +@register_fake("cadence::dequantize_per_tensor") def dequantize_per_tensor_meta( input: torch.Tensor, scale: float, zero_point: int, quant_min: int, quant_max: int, - dtype: ScalarType, -): + dtype: torch.dtype, +) -> torch.Tensor: return input.new_empty(input.size(), dtype=torch.float) -@impl(m, "quantized_linear") +@register_fake("cadence::quantized_linear") def quantized_linear_meta( src: torch.Tensor, weight: torch.Tensor, @@ -102,7 +103,7 @@ def quantized_linear_meta( out_shift: torch.Tensor, out_zero_point: int, offset: Optional[torch.Tensor], -): +) -> torch.Tensor: # src comes in shape [leading_dims, in_dim] # weight comes in shape [out_dim, in_dim] # output comes in empty with shape [leading_dims, out_dim] @@ -113,7 +114,7 @@ def quantized_linear_meta( return src.new_empty(out_size, dtype=torch.uint8) -@impl(m, "quantized_conv") +@register_fake("cadence::quantized_conv") def quantized_conv_meta( input: torch.Tensor, weight: torch.Tensor, @@ -151,7 +152,7 @@ def quantized_conv_meta( return input.new_empty(output_size, dtype=input.dtype) -@impl(m, "quantized_layer_norm") +@register_fake("cadence::quantized_layer_norm") def quantized_layer_norm_meta( input: torch.Tensor, X_scale: torch.Tensor, @@ -162,22 +163,22 @@ def quantized_layer_norm_meta( eps: float, output_scale: float, output_zero_point: int, -): +) -> torch.Tensor: return input.new_empty(input.size(), dtype=torch.uint8) -@impl(m, "quantized_relu") +@register_fake("cadence::quantized_relu") def quantized_relu_meta( X: torch.Tensor, X_zero_point: torch.Tensor, out_zero_point: int, out_multiplier: torch.Tensor, out_shift: torch.Tensor, -): +) -> torch.Tensor: return X.new_empty(X.size(), dtype=torch.uint8) -@impl(m, "quantized_matmul") +@register_fake("cadence::quantized_matmul") def quantized_matmul_meta( X: torch.Tensor, X_zero_point: int, diff --git a/backends/cadence/aot/passes.py b/backends/cadence/aot/passes.py index db419bfb5e1..83ef43d1510 100644 --- a/backends/cadence/aot/passes.py +++ b/backends/cadence/aot/passes.py @@ -266,3 +266,29 @@ def call(self, graph_module: torch.fx.GraphModule) -> PassResult: result = SpecPropPass()(graph_module) assert result is not None return result + + +class ReplaceSafeSoftmaxWithSoftmax(ExportPass): + """ + Replace _safe_softmax with _softmax + """ + + def call_operator( + self, + op, # pyre-ignore + args: tuple[Argument, ...], + kwargs: dict[str, Argument], + meta: NodeMetadata, + ) -> ProxyValue: + if op != torch.ops.aten._safe_softmax.default: + return super().call_operator(op, args, kwargs, meta) + + # Add False for the half_to_float argument of softmax + softmax_args = list(args) + [False] + + return super().call_operator( + torch.ops.aten._softmax.default, + tuple(softmax_args), + kwargs, + meta, + ) diff --git a/backends/cadence/aot/quantizer/utils.py 
b/backends/cadence/aot/quantizer/utils.py index 2afe5aba32e..0f9c9399780 100644 --- a/backends/cadence/aot/quantizer/utils.py +++ b/backends/cadence/aot/quantizer/utils.py @@ -145,7 +145,7 @@ def get_aten_node_target_partitions( """ Args: graph: The graph we want to partition - wanted_sources: List of orginal_aten ops (OpOverload) + wanted_original_aten_op: List of original_aten ops (OpOverload) Returns: Dictionary mapping aten ops that were given to a list of SourcePartitions diff --git a/backends/cadence/aot/utils.py b/backends/cadence/aot/utils.py index f0c294260a7..f081036ccc1 100644 --- a/backends/cadence/aot/utils.py +++ b/backends/cadence/aot/utils.py @@ -104,11 +104,11 @@ def get_ops_count(graph_module: torch.fx.GraphModule) -> Dict[str, int]: ): continue # If the op is already present, increment the count - if get_edge_overload_packet(node.target).__name__ in freq: - freq[get_edge_overload_packet(node.target).__name__] += 1 + if node.target._name in freq: + freq[node.target._name] += 1 # else, add a new entry else: - freq[get_edge_overload_packet(node.target).__name__] = 1 + freq[node.target._name] = 1 return freq @@ -177,3 +177,11 @@ def print_ops_info( tablefmt="outline", ) ) + + +def model_gm_has_SDPA(model_gm: torch.fx.GraphModule) -> bool: + for node in model_gm.graph.nodes: + if node.op == "call_function": + if node.target == torch.ops.aten.scaled_dot_product_attention.default: + return True + return False diff --git a/backends/cadence/build_cadence_runner.sh b/backends/cadence/build_cadence_runner.sh index 51f363f8de4..693a320bdf4 100755 --- a/backends/cadence/build_cadence_runner.sh +++ b/backends/cadence/build_cadence_runner.sh @@ -23,7 +23,7 @@ main() { rm -rf cmake-out cmake -DCMAKE_INSTALL_PREFIX=cmake-out \ -DCMAKE_BUILD_TYPE=Release \ - -DEXECUTORCH_BUILD_SDK=ON \ + -DEXECUTORCH_BUILD_DEVTOOLS=ON \ -DEXECUTORCH_ENABLE_EVENT_TRACER=ON \ -DPYTHON_EXECUTABLE=python3 \ -DEXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL=ON \ diff --git a/backends/cadence/build_cadence_xtensa.sh b/backends/cadence/build_cadence_xtensa.sh new file mode 100644 index 00000000000..f96436e65d5 --- /dev/null +++ b/backends/cadence/build_cadence_xtensa.sh @@ -0,0 +1,88 @@ +#!/bin/bash +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +set -euo pipefail + +unset CMAKE_PREFIX_PATH +git submodule sync +git submodule update --init +./install_requirements.sh + +rm -rf cmake-out + +STEPWISE_BUILD=false + +if $STEPWISE_BUILD; then + echo "Building ExecuTorch" + cmake -DCMAKE_INSTALL_PREFIX=cmake-out \ + -DCMAKE_TOOLCHAIN_FILE=./backends/cadence/cadence.cmake \ + -DCMAKE_BUILD_TYPE=Release \ + -DEXECUTORCH_ENABLE_EVENT_TRACER=OFF \ + -DEXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL=ON \ + -DEXECUTORCH_BUILD_EXECUTOR_RUNNER=OFF \ + -DEXECUTORCH_BUILD_PTHREADPOOL=OFF \ + -DEXECUTORCH_BUILD_CPUINFO=OFF \ + -DEXECUTORCH_ENABLE_LOGGING=ON \ + -DEXECUTORCH_USE_DL=OFF \ + -DEXECUTORCH_BUILD_CADENCE=OFF \ + -DFLATC_EXECUTABLE="$(which flatc)" \ + -Bcmake-out . 
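The get_ops_count change above keys the operator histogram on the target's _name attribute instead of the edge overload packet name. A rough standalone version of the same bookkeeping (generic over any fx.GraphModule; the fallback to str(target) is an addition for ops that do not expose _name):

from collections import Counter

import torch


def op_histogram(gm: torch.fx.GraphModule) -> dict:
    # Count call_function targets; edge-dialect ops expose a readable `_name`,
    # everything else falls back to its string form.
    counts: Counter = Counter()
    for node in gm.graph.nodes:
        if node.op != "call_function":
            continue
        key = getattr(node.target, "_name", None) or str(node.target)
        counts[key] += 1
    return dict(counts)


if __name__ == "__main__":
    def doubled_relu(x):
        return torch.relu(x) + torch.relu(x)

    traced = torch.fx.symbolic_trace(doubled_relu)
    print(op_histogram(traced))  # small histogram of call_function targets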
+ + echo "Building any Cadence-specific binaries on top" + cmake -DBUCK2="$BUCK" \ + -DCMAKE_TOOLCHAIN_FILE=./backends/cadence/cadence.cmake \ + -DCMAKE_INSTALL_PREFIX=cmake-out \ + -DCMAKE_BUILD_TYPE=Release \ + -DEXECUTORCH_BUILD_HOST_TARGETS=ON \ + -DEXECUTORCH_BUILD_EXECUTOR_RUNNER=ON \ + -DEXECUTORCH_BUILD_PTHREADPOOL=OFF \ + -DEXECUTORCH_BUILD_CADENCE=ON \ + -DFLATC_EXECUTABLE="$(which flatc)" \ + -DEXECUTORCH_ENABLE_LOGGING=ON \ + -DEXECUTORCH_ENABLE_PROGRAM_VERIFICATION=ON \ + -DEXECUTORCH_USE_DL=OFF \ + -DBUILD_EXECUTORCH_PORTABLE_OPS=ON \ + -DEXECUTORCH_BUILD_KERNELS_CUSTOM=OFF \ + -DPYTHON_EXECUTABLE=python3 \ + -DEXECUTORCH_NNLIB_OPT=ON \ + -DEXECUTORCH_BUILD_GFLAGS=ON \ + -DHAVE_FNMATCH_H=OFF \ + -Bcmake-out/backends/cadence \ + backends/cadence + cmake --build cmake-out/backends/cadence -j16 +else + echo "Building Cadence toolchain with ExecuTorch packages" + cmake_prefix_path="${PWD}/cmake-out/lib/cmake/ExecuTorch;${PWD}/cmake-out/third-party/gflags" + cmake -DBUCK2="$BUCK" \ + -DCMAKE_PREFIX_PATH="${cmake_prefix_path}" \ + -DCMAKE_TOOLCHAIN_FILE=./backends/cadence/cadence.cmake \ + -DCMAKE_INSTALL_PREFIX=cmake-out \ + -DCMAKE_BUILD_TYPE=Release \ + -DEXECUTORCH_BUILD_HOST_TARGETS=ON \ + -DEXECUTORCH_BUILD_EXECUTOR_RUNNER=ON \ + -DEXECUTORCH_BUILD_PTHREADPOOL=OFF \ + -DEXECUTORCH_BUILD_CADENCE=OFF \ + -DFLATC_EXECUTABLE="$(which flatc)" \ + -DEXECUTORCH_ENABLE_LOGGING=ON \ + -DEXECUTORCH_ENABLE_PROGRAM_VERIFICATION=ON \ + -DEXECUTORCH_USE_DL=OFF \ + -DBUILD_EXECUTORCH_PORTABLE_OPS=ON \ + -DEXECUTORCH_BUILD_KERNELS_CUSTOM=OFF \ + -DPYTHON_EXECUTABLE=python3 \ + -DEXECUTORCH_NNLIB_OPT=ON \ + -DEXECUTORCH_BUILD_GFLAGS=ON \ + -DHAVE_FNMATCH_H=OFF \ + -DEXECUTORCH_ENABLE_EVENT_TRACER=OFF \ + -DEXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL=ON \ + -DEXECUTORCH_BUILD_CPUINFO=OFF \ + -Bcmake-out + cmake --build cmake-out --target install --config Release -j16 +fi + +echo "Run simple model to verify cmake build" +python3 -m examples.portable.scripts.export --model_name="add" +xt-run --turbo cmake-out/executor_runner --model_path=add.pte diff --git a/backends/cadence/cadence_runner/cadence_runner.cpp b/backends/cadence/cadence_runner/cadence_runner.cpp index d76ba004aae..a269ed5a8e8 100644 --- a/backends/cadence/cadence_runner/cadence_runner.cpp +++ b/backends/cadence/cadence_runner/cadence_runner.cpp @@ -22,13 +22,13 @@ #include +#include +#include #include #include #include #include #include -#include -#include static uint8_t method_allocator_pool[4 * 1024U * 1024U]; // 4MB diff --git a/backends/cadence/cadence_runner/targets.bzl b/backends/cadence/cadence_runner/targets.bzl index 028ff7ad2ef..b59a98cd75a 100644 --- a/backends/cadence/cadence_runner/targets.bzl +++ b/backends/cadence/cadence_runner/targets.bzl @@ -19,12 +19,11 @@ def define_common_targets(): visibility = ["PUBLIC"], deps = [ "fbsource//arvr/third-party/gflags:gflags", - "fbsource//xplat/executorch/kernels/portable:generated_lib", - "fbsource//xplat/executorch/runtime/executor:program", + "fbsource//xplat/executorch/devtools/etdump:etdump_flatcc", + "fbsource//xplat/executorch/devtools/bundled_program:runtime", "fbsource//xplat/executorch/extension/data_loader:file_data_loader", "fbsource//xplat/executorch/extension/data_loader:buffer_data_loader", - "fbsource//xplat/executorch/util:util", - "fbsource//xplat/executorch/sdk/etdump:etdump_flatcc", - "fbsource//xplat/executorch/sdk/bundled_program:runtime", + "fbsource//xplat/executorch/kernels/portable:generated_lib", + "fbsource//xplat/executorch/runtime/executor:program", ], 
) diff --git a/backends/cadence/hifi/kernels/CMakeLists.txt b/backends/cadence/hifi/kernels/CMakeLists.txt index 8cbeb3e1806..15d1a4ddd52 100644 --- a/backends/cadence/hifi/kernels/CMakeLists.txt +++ b/backends/cadence/hifi/kernels/CMakeLists.txt @@ -13,11 +13,12 @@ add_library( target_include_directories( cadence_kernels - PUBLIC . - ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/nnlib-hifi4/xa_nnlib/algo/common/include/ - ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/nnlib-hifi4/xa_nnlib/include/nnlib - ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/nnlib-hifi4/xa_nnlib/include - ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/nnlib-hifi4/xa_nnlib/algo/ndsp/hifi4/include/ + PUBLIC + . + ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/nnlib-hifi4/xa_nnlib/algo/common/include/ + ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/nnlib-hifi4/xa_nnlib/include/nnlib + ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/nnlib-hifi4/xa_nnlib/include + ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/nnlib-hifi4/xa_nnlib/algo/ndsp/hifi4/include/ ) target_link_libraries(cadence_kernels PRIVATE xa_nnlib) diff --git a/backends/cadence/hifi/kernels/TARGETS b/backends/cadence/hifi/kernels/TARGETS new file mode 100644 index 00000000000..67f2bab681a --- /dev/null +++ b/backends/cadence/hifi/kernels/TARGETS @@ -0,0 +1,5 @@ +load("targets.bzl", "define_common_targets") + +oncall("odai_jarvis") + +define_common_targets() diff --git a/backends/cadence/hifi/kernels/kernels.cpp b/backends/cadence/hifi/kernels/kernels.cpp index 5a2d58d2e2f..4d9183e4cc2 100644 --- a/backends/cadence/hifi/kernels/kernels.cpp +++ b/backends/cadence/hifi/kernels/kernels.cpp @@ -6,9 +6,9 @@ * LICENSE file in the root directory of this source tree. 
*/ -#include -#include "xa_nnlib_common.h" -#include "xa_nnlib_common_macros.h" +#include +#include +#include namespace impl { namespace HiFi { diff --git a/backends/cadence/hifi/kernels/kernels.h b/backends/cadence/hifi/kernels/kernels.h index 789c8942a85..b5659824615 100644 --- a/backends/cadence/hifi/kernels/kernels.h +++ b/backends/cadence/hifi/kernels/kernels.h @@ -8,12 +8,9 @@ #pragma once -#include "inttypes.h" -#include "stddef.h" -#include "xa_type_def.h" - -/* For NNLIB APIs */ -#include "xa_nnlib_kernels_api.h" +#include +#include +#include namespace impl { namespace HiFi { diff --git a/backends/cadence/hifi/kernels/targets.bzl b/backends/cadence/hifi/kernels/targets.bzl new file mode 100644 index 00000000000..acdc39dd16d --- /dev/null +++ b/backends/cadence/hifi/kernels/targets.bzl @@ -0,0 +1,18 @@ +load("@fbsource//tools/build_defs:platform_defs.bzl", "CXX") +load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") + +def define_common_targets(): + runtime.cxx_library( + name = "kernels", + srcs = ["kernels.cpp"], + exported_headers = [ + "kernels.h", + ], + visibility = [ + "//executorch/backends/cadence/...", + ], + exported_deps = [ + "fbsource//third-party/nnlib-hifi4/xa_nnlib:libxa_nnlib_common", + ], + platforms = CXX, + ) diff --git a/backends/cadence/hifi/operators/CMakeLists.txt b/backends/cadence/hifi/operators/CMakeLists.txt index 996d109db48..8da6169cda1 100644 --- a/backends/cadence/hifi/operators/CMakeLists.txt +++ b/backends/cadence/hifi/operators/CMakeLists.txt @@ -44,7 +44,8 @@ set(_aten_ops__srcs "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_sub.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_to_copy.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_view_copy.cpp" - "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_where.cpp") + "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_where.cpp" +) add_library(aten_ops_cadence ${_aten_ops__srcs}) target_link_libraries(aten_ops_cadence PUBLIC executorch) target_link_libraries(aten_ops_cadence PRIVATE cadence_kernels) @@ -52,18 +53,20 @@ target_link_libraries(aten_ops_cadence PRIVATE cadence_kernels) # Let files say "include ". set(_common_include_directories ${EXECUTORCH_ROOT}/..) -target_include_directories(aten_ops_cadence PUBLIC ${ROOT_DIR}/.. - ${CMAKE_BINARY_DIR} - ${_common_include_directories}) +target_include_directories( + aten_ops_cadence PUBLIC ${ROOT_DIR}/.. ${CMAKE_BINARY_DIR} + ${_common_include_directories} +) # Custom ops that are needed to run the test model. add_library( - custom_ops "quantized_linear_out.cpp" - "quantized_layer_norm.cpp" - "quantize_per_tensor.cpp" "dequantize_per_tensor.cpp") -target_include_directories(custom_ops PUBLIC ${ROOT_DIR}/.. - ${CMAKE_BINARY_DIR} - ${_common_include_directories}) + custom_ops "quantized_linear_out.cpp" "quantized_layer_norm.cpp" + "quantize_per_tensor.cpp" "dequantize_per_tensor.cpp" +) +target_include_directories( + custom_ops PUBLIC ${ROOT_DIR}/.. 
${CMAKE_BINARY_DIR} + ${_common_include_directories} +) target_link_libraries(custom_ops PUBLIC executorch) target_link_libraries(custom_ops PRIVATE cadence_kernels) @@ -75,12 +78,11 @@ gen_selected_ops( "${CMAKE_CURRENT_LIST_DIR}/../../aot/functions_hifi.yaml" "" "" ) generate_bindings_for_kernels( - LIB_NAME "cadence_ops_lib" OPS_SCHEMA_YAML - FUNCTIONS_YAML ${CMAKE_CURRENT_SOURCE_DIR}/../../aot/functions_hifi.yaml + LIB_NAME "cadence_ops_lib" OPS_SCHEMA_YAML FUNCTIONS_YAML + ${CMAKE_CURRENT_SOURCE_DIR}/../../aot/functions_hifi.yaml ) message("Generated files ${gen_command_sources}") gen_operators_lib( - LIB_NAME "cadence_ops_lib" - KERNEL_LIBS custom_ops - DEPS aten_ops_cadence) + LIB_NAME "cadence_ops_lib" KERNEL_LIBS custom_ops DEPS aten_ops_cadence +) diff --git a/backends/cadence/hifi/operators/TARGETS b/backends/cadence/hifi/operators/TARGETS new file mode 100644 index 00000000000..67f2bab681a --- /dev/null +++ b/backends/cadence/hifi/operators/TARGETS @@ -0,0 +1,5 @@ +load("targets.bzl", "define_common_targets") + +oncall("odai_jarvis") + +define_common_targets() diff --git a/backends/cadence/hifi/operators/dequantize_per_tensor.cpp b/backends/cadence/hifi/operators/dequantize_per_tensor.cpp index 0067f6510db..79645f5381d 100644 --- a/backends/cadence/hifi/operators/dequantize_per_tensor.cpp +++ b/backends/cadence/hifi/operators/dequantize_per_tensor.cpp @@ -6,19 +6,20 @@ * LICENSE file in the root directory of this source tree. */ -#include +#include #include +#include namespace impl { namespace HiFi { namespace native { using Tensor = exec_aten::Tensor; -using RuntimeContext = torch::executor::RuntimeContext; +using executorch::runtime::KernelRuntimeContext; using ScalarType = exec_aten::ScalarType; void dequantize_per_tensor_out( - RuntimeContext& context, + KernelRuntimeContext& context, const Tensor& input, double scale, int64_t zero_point, diff --git a/backends/cadence/hifi/operators/quantize_per_tensor.cpp b/backends/cadence/hifi/operators/quantize_per_tensor.cpp index bc0d315f3dd..e280f6bcffd 100644 --- a/backends/cadence/hifi/operators/quantize_per_tensor.cpp +++ b/backends/cadence/hifi/operators/quantize_per_tensor.cpp @@ -6,21 +6,22 @@ * LICENSE file in the root directory of this source tree. */ -#include +#include #include +#include namespace impl { namespace HiFi { namespace native { using Tensor = exec_aten::Tensor; -using RuntimeContext = torch::executor::RuntimeContext; +using executorch::runtime::KernelRuntimeContext; using ScalarType = exec_aten::ScalarType; // Quantize the input tensor (PT2 version). Note that quant_ are not // used in any computation. void quantize_per_tensor_out( - RuntimeContext& context, + KernelRuntimeContext& context, const Tensor& input, double scale, int64_t zero_point, diff --git a/backends/cadence/hifi/operators/quantized_layer_norm.cpp b/backends/cadence/hifi/operators/quantized_layer_norm.cpp index 034e5b28848..3974d6ee5e9 100644 --- a/backends/cadence/hifi/operators/quantized_layer_norm.cpp +++ b/backends/cadence/hifi/operators/quantized_layer_norm.cpp @@ -6,15 +6,14 @@ * LICENSE file in the root directory of this source tree. 
*/ -#include +#include #include - #include #include #include using Tensor = exec_aten::Tensor; -using RuntimeContext = torch::executor::RuntimeContext; +using executorch::runtime::KernelRuntimeContext; namespace impl { namespace HiFi { @@ -76,9 +75,11 @@ void quantized_layer_norm_( for (size_t j = 0; j < last_dim; ++j) { // Since X is quantized, we dequantize it, compute fp32 result, and // quantize the result to an int8/uint8 value. - float val = kernels::dequantize(x[j], input_scale, input_zero_point); + float val = impl::HiFi::kernels::dequantize( + x[j], input_scale, input_zero_point); val = (val - mean) * inv_std * weight_data[j] + bias_data[j]; - y[j] = kernels::quantize(val, output_inv_scale, output_zero_point); + y[j] = impl::HiFi::kernels::quantize( + val, output_inv_scale, output_zero_point); } } } @@ -114,7 +115,7 @@ void quantized_layer_norm_( } void quantized_layer_norm_out( - RuntimeContext& ctx, + KernelRuntimeContext& ctx, const Tensor& input, const Tensor& in_scale, const Tensor& in_zero_point, diff --git a/backends/cadence/hifi/operators/quantized_linear_out.cpp b/backends/cadence/hifi/operators/quantized_linear_out.cpp index ddba4df17c2..fb186abbb14 100644 --- a/backends/cadence/hifi/operators/quantized_linear_out.cpp +++ b/backends/cadence/hifi/operators/quantized_linear_out.cpp @@ -6,8 +6,7 @@ * LICENSE file in the root directory of this source tree. */ -#include - +#include #include #include #include @@ -17,10 +16,10 @@ namespace HiFi { namespace native { using Tensor = exec_aten::Tensor; -using RuntimeContext = torch::executor::RuntimeContext; +using executorch::runtime::KernelRuntimeContext; void quantized_linear_out( - RuntimeContext& ctx, + KernelRuntimeContext& ctx, const Tensor& src, const Tensor& weight, const Tensor& bias, diff --git a/backends/cadence/hifi/operators/targets.bzl b/backends/cadence/hifi/operators/targets.bzl new file mode 100644 index 00000000000..c7b24d790f0 --- /dev/null +++ b/backends/cadence/hifi/operators/targets.bzl @@ -0,0 +1,30 @@ +load("@fbsource//tools/build_defs:platform_defs.bzl", "CXX") +load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") + +def define_common_targets(): + """Defines targets that should be shared between fbcode and xplat. + + The directory containing this targets.bzl file should also contain both + TARGETS and BUCK files that call this function. + """ + + # Define build targets for all operators registered in the tables above. 
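The HiFi quantized layer-norm kernels above dequantize each element, normalize with the row mean and inverse standard deviation, scale by weight and bias, then requantize. A reference sketch of that per-row math (standard affine quantize/dequantize assumed; the kernel multiplies by an inverse output scale, which the division below reproduces):

import torch


def quantized_layer_norm_ref(
    x_q: torch.Tensor,          # int8/uint8 input, shape [rows, last_dim]
    in_scale: float,
    in_zero_point: int,
    weight: torch.Tensor,       # fp32, shape [last_dim]
    bias: torch.Tensor,         # fp32, shape [last_dim]
    eps: float,
    out_scale: float,
    out_zero_point: int,
    out_dtype: torch.dtype = torch.uint8,
) -> torch.Tensor:
    x = (x_q.to(torch.float32) - in_zero_point) * in_scale        # dequantize
    mean = x.mean(dim=-1, keepdim=True)
    inv_std = torch.rsqrt(x.var(dim=-1, unbiased=False, keepdim=True) + eps)
    y = (x - mean) * inv_std * weight + bias                       # normalize
    info = torch.iinfo(out_dtype)
    y_q = torch.round(y / out_scale) + out_zero_point              # requantize
    return y_q.clamp(info.min, info.max).to(out_dtype)


if __name__ == "__main__":
    xq = torch.randint(0, 255, (2, 8), dtype=torch.uint8)
    w, b = torch.ones(8), torch.zeros(8)
    print(quantized_layer_norm_ref(xq, 0.05, 128, w, b, 1e-5, 0.05, 128))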
+ + runtime.cxx_library( + name = "cadence_hifi_ops", + srcs = glob([ + "*.cpp", + ]), + platforms = CXX, + deps = [ + "//executorch/kernels/portable/cpu/util:broadcast_util", + "//executorch/runtime/kernel:kernel_includes", + "//executorch/kernels/portable/cpu:scalar_utils", + "fbsource//third-party/nnlib-hifi4/xa_nnlib:libxa_nnlib", + "fbsource//third-party/nnlib-hifi4/xa_nnlib:libxa_nnlib_common", + "//executorch/backends/cadence/hifi/kernels:kernels", + ], + visibility = [ + "//executorch/backends/cadence/...", + ], + ) diff --git a/backends/cadence/hifi/third-party/nnlib/CMakeLists.txt b/backends/cadence/hifi/third-party/nnlib/CMakeLists.txt index e93e0759d2c..90eca6b47e1 100644 --- a/backends/cadence/hifi/third-party/nnlib/CMakeLists.txt +++ b/backends/cadence/hifi/third-party/nnlib/CMakeLists.txt @@ -1,30 +1,19 @@ - cmake_minimum_required(VERSION 3.10.0) project(cadence_nnlib) - -add_custom_target( nnlib_target ALL COMMAND - make install_nnlib -f makefile -C ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/nnlib-hifi4/xa_nnlib/build - OBJDIR=${CMAKE_CURRENT_BINARY_DIR}/obj - LIBDIR=${CMAKE_CURRENT_BINARY_DIR}/lib - -j8 ) +add_custom_target( + nnlib_target ALL + COMMAND + make install_nnlib -f makefile -C + ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/nnlib-hifi4/xa_nnlib/build + OBJDIR=${CMAKE_CURRENT_BINARY_DIR}/obj + LIBDIR=${CMAKE_CURRENT_BINARY_DIR}/lib -j8 +) add_library(xa_nnlib STATIC IMPORTED GLOBAL) add_dependencies(xa_nnlib nnlib_target) set_property( - TARGET xa_nnlib - PROPERTY - IMPORTED_LOCATION "${CMAKE_CURRENT_BINARY_DIR}/lib/xa_nnlib.a" + TARGET xa_nnlib PROPERTY IMPORTED_LOCATION + "${CMAKE_CURRENT_BINARY_DIR}/lib/xa_nnlib.a" ) - - - - - - - - - - - diff --git a/backends/cadence/reference/kernels/CMakeLists.txt b/backends/cadence/reference/kernels/CMakeLists.txt index eadb01f54d5..fba66e9b27a 100644 --- a/backends/cadence/reference/kernels/CMakeLists.txt +++ b/backends/cadence/reference/kernels/CMakeLists.txt @@ -5,12 +5,6 @@ # LICENSE file in the root directory of this source tree. # lint_cmake: -linelength -add_library( - cadence_kernels - kernels.cpp -) +add_library(cadence_kernels kernels.cpp) -target_include_directories( - cadence_kernels - PUBLIC . -) +target_include_directories(cadence_kernels PUBLIC .) diff --git a/backends/cadence/reference/operators/CMakeLists.txt b/backends/cadence/reference/operators/CMakeLists.txt index 71b0304c997..605c43ef715 100644 --- a/backends/cadence/reference/operators/CMakeLists.txt +++ b/backends/cadence/reference/operators/CMakeLists.txt @@ -50,7 +50,8 @@ set(_aten_ops__srcs "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_where.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_expand_copy.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_gelu.cpp" - "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_empty.cpp") + "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_empty.cpp" +) add_library(aten_ops_cadence ${_aten_ops__srcs}) target_link_libraries(aten_ops_cadence PUBLIC executorch) target_link_libraries(aten_ops_cadence PRIVATE cadence_kernels) @@ -58,19 +59,26 @@ target_link_libraries(aten_ops_cadence PRIVATE cadence_kernels) # Let files say "include ". set(_common_include_directories ${EXECUTORCH_ROOT}/..) -target_include_directories(aten_ops_cadence PUBLIC ${ROOT_DIR}/.. - ${CMAKE_BINARY_DIR} - ${_common_include_directories}) +target_include_directories( + aten_ops_cadence PUBLIC ${ROOT_DIR}/.. ${CMAKE_BINARY_DIR} + ${_common_include_directories} +) # Custom ops that are needed to run the test model. 
add_library( - custom_ops "quantized_linear_out.cpp" "quantized_conv_out.cpp" - "quantized_relu_out.cpp" "quantized_layer_norm.cpp" - "quantize_per_tensor.cpp" "dequantize_per_tensor.cpp" - "quantized_matmul_out.cpp") -target_include_directories(custom_ops PUBLIC ${ROOT_DIR}/.. - ${CMAKE_BINARY_DIR} - ${_common_include_directories}) + custom_ops + "quantized_linear_out.cpp" + "quantized_conv_out.cpp" + "quantized_relu_out.cpp" + "quantized_layer_norm.cpp" + "quantize_per_tensor.cpp" + "dequantize_per_tensor.cpp" + "quantized_matmul_out.cpp" +) +target_include_directories( + custom_ops PUBLIC ${ROOT_DIR}/.. ${CMAKE_BINARY_DIR} + ${_common_include_directories} +) target_link_libraries(custom_ops PUBLIC executorch) target_link_libraries(custom_ops PRIVATE cadence_kernels) @@ -82,12 +90,11 @@ gen_selected_ops( "${CMAKE_CURRENT_LIST_DIR}/../../aot/functions.yaml" "" "" ) generate_bindings_for_kernels( - LIB_NAME "cadence_ops_lib" OPS_SCHEMA_YAML - FUNCTIONS_YAML ${CMAKE_CURRENT_SOURCE_DIR}/../../aot/functions.yaml + LIB_NAME "cadence_ops_lib" OPS_SCHEMA_YAML FUNCTIONS_YAML + ${CMAKE_CURRENT_SOURCE_DIR}/../../aot/functions.yaml ) message("Generated cadence x86 files ${gen_command_sources}") gen_operators_lib( - LIB_NAME "cadence_ops_lib" - KERNEL_LIBS custom_ops - DEPS aten_ops_cadence) + LIB_NAME "cadence_ops_lib" KERNEL_LIBS custom_ops DEPS aten_ops_cadence +) diff --git a/backends/cadence/reference/operators/dequantize_per_tensor.cpp b/backends/cadence/reference/operators/dequantize_per_tensor.cpp index 29323ce612f..9c6cf6ecc55 100644 --- a/backends/cadence/reference/operators/dequantize_per_tensor.cpp +++ b/backends/cadence/reference/operators/dequantize_per_tensor.cpp @@ -14,11 +14,11 @@ namespace reference { namespace native { using Tensor = exec_aten::Tensor; -using RuntimeContext = torch::executor::RuntimeContext; +using executorch::runtime::KernelRuntimeContext; using ScalarType = exec_aten::ScalarType; void dequantize_per_tensor_out( - RuntimeContext& context, + KernelRuntimeContext& context, const Tensor& input, double scale, int64_t zero_point, diff --git a/backends/cadence/reference/operators/op_add.cpp b/backends/cadence/reference/operators/op_add.cpp index 3a8a3887171..89b67467605 100644 --- a/backends/cadence/reference/operators/op_add.cpp +++ b/backends/cadence/reference/operators/op_add.cpp @@ -16,7 +16,7 @@ namespace executor { namespace native { Tensor& add_out( - RuntimeContext& ctx, + KernelRuntimeContext& ctx, const Tensor& a, const Tensor& b, const Scalar& alpha, diff --git a/backends/cadence/reference/operators/op_embedding.cpp b/backends/cadence/reference/operators/op_embedding.cpp index f0b625c963e..e1e4984b56e 100644 --- a/backends/cadence/reference/operators/op_embedding.cpp +++ b/backends/cadence/reference/operators/op_embedding.cpp @@ -13,10 +13,10 @@ namespace executor { namespace native { using Tensor = exec_aten::Tensor; -using RuntimeContext = torch::executor::RuntimeContext; +using executorch::runtime::KernelRuntimeContext; void embedding_out( - RuntimeContext& ctx, + KernelRuntimeContext& ctx, const Tensor& weight, const Tensor& indices, int64_t padding_idx, diff --git a/backends/cadence/reference/operators/op_full.cpp b/backends/cadence/reference/operators/op_full.cpp index 75d1d51901a..00be1889651 100644 --- a/backends/cadence/reference/operators/op_full.cpp +++ b/backends/cadence/reference/operators/op_full.cpp @@ -17,7 +17,7 @@ using Tensor = exec_aten::Tensor; using ScalarType = exec_aten::ScalarType; Tensor& full_out( - RuntimeContext& ctx, + 
KernelRuntimeContext& ctx, const IntArrayRef sizes, const Scalar& fill_value, Tensor& out) { diff --git a/backends/cadence/reference/operators/op_view_copy.cpp b/backends/cadence/reference/operators/op_view_copy.cpp index a363125c375..ac0a8598499 100644 --- a/backends/cadence/reference/operators/op_view_copy.cpp +++ b/backends/cadence/reference/operators/op_view_copy.cpp @@ -13,10 +13,10 @@ namespace executor { namespace native { using Tensor = exec_aten::Tensor; -using RuntimeContext = torch::executor::RuntimeContext; +using executorch::runtime::KernelRuntimeContext; Tensor& view_copy_out( - RuntimeContext& ctx, + KernelRuntimeContext& ctx, const Tensor& input, const IntArrayRef size, Tensor& out) { diff --git a/backends/cadence/reference/operators/quantize_per_tensor.cpp b/backends/cadence/reference/operators/quantize_per_tensor.cpp index c2e53cda885..bc200fd376e 100644 --- a/backends/cadence/reference/operators/quantize_per_tensor.cpp +++ b/backends/cadence/reference/operators/quantize_per_tensor.cpp @@ -14,13 +14,13 @@ namespace reference { namespace native { using Tensor = exec_aten::Tensor; -using RuntimeContext = torch::executor::RuntimeContext; +using executorch::runtime::KernelRuntimeContext; using ScalarType = exec_aten::ScalarType; // Quantize the input tensor (PT2 version). Note that quant_ are not // used in any computation. void quantize_per_tensor_out( - RuntimeContext& context, + KernelRuntimeContext& context, const Tensor& input, double scale, int64_t zero_point, diff --git a/backends/cadence/reference/operators/quantized_conv_out.cpp b/backends/cadence/reference/operators/quantized_conv_out.cpp index 4bb7b12a887..47234a7cd95 100644 --- a/backends/cadence/reference/operators/quantized_conv_out.cpp +++ b/backends/cadence/reference/operators/quantized_conv_out.cpp @@ -17,7 +17,7 @@ namespace reference { namespace native { using Tensor = exec_aten::Tensor; -using RuntimeContext = torch::executor::RuntimeContext; +using executorch::runtime::KernelRuntimeContext; // This implements a generic 2d conv kernel that operates on raw pointers. // The version handles both quantized and fp32 convolutions. @@ -156,7 +156,7 @@ __attribute__((noinline)) void conv2d_nchw_core_generic( // quantized::conv1d or quantized::conv2d based on the dimensionality of // activation tensor. 
void quantized_conv_out( - RuntimeContext& ctx, + KernelRuntimeContext& ctx, const Tensor& input, const Tensor& weight, const Tensor& bias, diff --git a/backends/cadence/reference/operators/quantized_layer_norm.cpp b/backends/cadence/reference/operators/quantized_layer_norm.cpp index 6588748d2da..a2dd644a976 100644 --- a/backends/cadence/reference/operators/quantized_layer_norm.cpp +++ b/backends/cadence/reference/operators/quantized_layer_norm.cpp @@ -14,7 +14,7 @@ #include using Tensor = exec_aten::Tensor; -using RuntimeContext = torch::executor::RuntimeContext; +using executorch::runtime::KernelRuntimeContext; namespace impl { namespace reference { @@ -112,7 +112,7 @@ void quantized_layer_norm_( } void quantized_layer_norm_out( - RuntimeContext& ctx, + KernelRuntimeContext& ctx, const Tensor& input, const Tensor& in_scale, const Tensor& in_zero_point, diff --git a/backends/cadence/reference/operators/quantized_linear_out.cpp b/backends/cadence/reference/operators/quantized_linear_out.cpp index 43289b3a28b..300158d8e5e 100644 --- a/backends/cadence/reference/operators/quantized_linear_out.cpp +++ b/backends/cadence/reference/operators/quantized_linear_out.cpp @@ -14,10 +14,10 @@ namespace reference { namespace native { using Tensor = exec_aten::Tensor; -using RuntimeContext = torch::executor::RuntimeContext; +using executorch::runtime::KernelRuntimeContext; void quantized_linear_out( - RuntimeContext& ctx, + KernelRuntimeContext& ctx, const Tensor& src, const Tensor& weight, const Tensor& bias, diff --git a/backends/cadence/reference/operators/quantized_matmul_out.cpp b/backends/cadence/reference/operators/quantized_matmul_out.cpp index d65175f8f17..b381a8ee394 100644 --- a/backends/cadence/reference/operators/quantized_matmul_out.cpp +++ b/backends/cadence/reference/operators/quantized_matmul_out.cpp @@ -14,7 +14,7 @@ namespace reference { namespace native { using Tensor = exec_aten::Tensor; -using RuntimeContext = torch::executor::RuntimeContext; +using executorch::runtime::KernelRuntimeContext; // The quantized matmul. The quantized matmul accumulates in a wider register, // whose type is TA. 
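The comment above notes that the quantized matmul accumulates in a wider register type. A small reference sketch of that pattern (requantization is collapsed to one float scale here; the real operator uses an integer multiplier/shift pair and the wider accumulator template type):

import torch


def quantized_matmul_ref(
    x_q: torch.Tensor, x_zp: int,      # int8, shape [m, k]
    y_q: torch.Tensor, y_zp: int,      # int8, shape [k, n]
    out_scale: float, out_zp: int,
) -> torch.Tensor:
    x32 = x_q.to(torch.int32) - x_zp                     # widen before accumulating
    y32 = y_q.to(torch.int32) - y_zp
    # elementwise product summed over k; everything stays in int32
    acc = (x32.unsqueeze(2) * y32.unsqueeze(0)).sum(dim=1)
    out = torch.round(acc.to(torch.float32) * out_scale) + out_zp
    return out.clamp(-128, 127).to(torch.int8)


if __name__ == "__main__":
    xq = torch.randint(-128, 128, (2, 4), dtype=torch.int8)
    yq = torch.randint(-128, 128, (4, 3), dtype=torch.int8)
    print(quantized_matmul_ref(xq, 0, yq, 0, out_scale=0.01, out_zp=0))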
@@ -108,7 +108,7 @@ void inline _typed_quantized_matmul( } void quantized_matmul_out( - RuntimeContext& ctx, + KernelRuntimeContext& ctx, const Tensor& X, int64_t X_zero_point, const Tensor& Y, diff --git a/backends/cadence/reference/operators/quantized_relu_out.cpp b/backends/cadence/reference/operators/quantized_relu_out.cpp index ef1813f65c7..04cb2c88336 100644 --- a/backends/cadence/reference/operators/quantized_relu_out.cpp +++ b/backends/cadence/reference/operators/quantized_relu_out.cpp @@ -14,7 +14,7 @@ namespace reference { namespace native { using Tensor = exec_aten::Tensor; -using RuntimeContext = torch::executor::RuntimeContext; +using executorch::runtime::KernelRuntimeContext; template void quantized_relu_( @@ -44,7 +44,7 @@ void quantized_relu_( } void quantized_relu_out( - RuntimeContext& ctx, + KernelRuntimeContext& ctx, const Tensor& input, const Tensor& in_zero_point, const int64_t out_zero_point, diff --git a/backends/cadence/runtime/TARGETS b/backends/cadence/runtime/TARGETS index 9f30cadf6fd..1b55a7d541b 100644 --- a/backends/cadence/runtime/TARGETS +++ b/backends/cadence/runtime/TARGETS @@ -13,9 +13,9 @@ python_library( typing = True, deps = [ "//caffe2:torch", + "//executorch/devtools/bundled_program:config", + "//executorch/devtools/bundled_program:core", + "//executorch/devtools/bundled_program/serialize:lib", "//executorch/exir:lib", - "//executorch/sdk/bundled_program:config", - "//executorch/sdk/bundled_program:core", - "//executorch/sdk/bundled_program/serialize:lib", ], ) diff --git a/backends/cadence/runtime/executor.py b/backends/cadence/runtime/executor.py index 7bcf705c034..d07b1b6a52e 100644 --- a/backends/cadence/runtime/executor.py +++ b/backends/cadence/runtime/executor.py @@ -18,14 +18,13 @@ import torch -from executorch.exir import ExecutorchProgram, ExecutorchProgramManager - -from executorch.sdk.bundled_program.config import MethodTestCase, MethodTestSuite -from executorch.sdk.bundled_program.core import BundledProgram +from executorch.devtools.bundled_program.config import MethodTestCase, MethodTestSuite +from executorch.devtools.bundled_program.core import BundledProgram -from executorch.sdk.bundled_program.serialize import ( +from executorch.devtools.bundled_program.serialize import ( serialize_from_bundled_program_to_flatbuffer, ) +from executorch.exir import ExecutorchProgram, ExecutorchProgramManager # If quiet is true, suppress the printing of stdout and stderr output. quiet = False diff --git a/backends/cadence/runtime/executor_main.sh b/backends/cadence/runtime/executor_main.sh index c850ab8b4a9..7d6cba09b87 100644 --- a/backends/cadence/runtime/executor_main.sh +++ b/backends/cadence/runtime/executor_main.sh @@ -5,7 +5,7 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -# Test the end-to-end flow of building sdk_example_runner and use it to run +# Test the end-to-end flow of building devtools/example_runner and use it to run # an actual model. 
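The executor.py hunk above only moves the bundled-program imports from executorch.sdk to executorch.devtools. For context, a sketch of how those pieces fit together; the constructor keyword names and the program_manager/model placeholders are assumptions based on the public devtools examples, not code from this file:

from typing import Any, Tuple

import torch

from executorch.devtools.bundled_program.config import MethodTestCase, MethodTestSuite
from executorch.devtools.bundled_program.core import BundledProgram
from executorch.devtools.bundled_program.serialize import (
    serialize_from_bundled_program_to_flatbuffer,
)


def bundle_and_serialize(
    program_manager: Any,  # an ExecutorchProgramManager produced elsewhere (placeholder)
    model: torch.nn.Module,
    example_inputs: Tuple[torch.Tensor, ...],
) -> bytes:
    # One test suite per method; keyword names are assumed from devtools examples.
    suite = MethodTestSuite(
        method_name="forward",
        test_cases=[
            MethodTestCase(
                inputs=list(example_inputs),
                expected_outputs=model(*example_inputs),
            )
        ],
    )
    bundled = BundledProgram(program_manager, [suite])
    return serialize_from_bundled_program_to_flatbuffer(bundled)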
@@ -14,21 +14,21 @@ set -e # shellcheck source=/dev/null source "$(dirname "${BASH_SOURCE[0]}")/../../.ci/scripts/utils.sh" -cmake_install_executorch_sdk_lib() { +cmake_install_executorch_devtools_lib() { echo "Installing libexecutorch.a, libportable_kernels.a, libetdump.a, libbundled_program.a" rm -rf cmake-out retry cmake -DCMAKE_INSTALL_PREFIX=cmake-out \ -DCMAKE_BUILD_TYPE=Release \ - -DEXECUTORCH_BUILD_SDK=ON \ + -DEXECUTORCH_BUILD_DEVTOOLS=ON \ -DEXECUTORCH_ENABLE_EVENT_TRACER=ON \ -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" \ -Bcmake-out . cmake --build cmake-out -j9 --target install --config Release } -test_cmake_sdk_example_runner() { - local example_dir=examples/sdk +test_cmake_devtools_example_runner() { + local example_dir=examples/devtools local build_dir=cmake-out/${example_dir} CMAKE_PREFIX_PATH="${PWD}/cmake-out/lib/cmake/ExecuTorch;${PWD}/cmake-out/third-party/gflags" rm -rf ${build_dir} @@ -42,8 +42,8 @@ test_cmake_sdk_example_runner() { echo "Building ${example_dir}" cmake --build ${build_dir} -j9 --config Release - echo 'Running sdk_example_runner' - ${build_dir}/sdk_example_runner --bundled_program_path="./CadenceDemoModel.bpte" + echo 'Running devtools/example_runner' + ${build_dir}/example_runner --bundled_program_path="./CadenceDemoModel.bpte" } if [[ -z $PYTHON_EXECUTABLE ]]; @@ -56,5 +56,5 @@ then BUCK=buck2 fi -cmake_install_executorch_sdk_lib -test_cmake_sdk_example_runner +cmake_install_executorch_devtools_lib +test_cmake_devtools_example_runner diff --git a/backends/cadence/runtime/runtime.py b/backends/cadence/runtime/runtime.py index ec282f8f7b3..33bb20719c8 100644 --- a/backends/cadence/runtime/runtime.py +++ b/backends/cadence/runtime/runtime.py @@ -18,10 +18,10 @@ from executorch.backends.cadence.runtime import utils from executorch.backends.cadence.runtime.executor import Executor +from executorch.devtools import Inspector from executorch.exir import ExecutorchProgramManager from executorch.exir._serialize._program import deserialize_pte_binary from executorch.exir.schema import DataLocation -from executorch.sdk import Inspector from numpy import ndarray diff --git a/backends/example/test_example_delegate.py b/backends/example/test_example_delegate.py index 973b457bade..d830c1bb312 100644 --- a/backends/example/test_example_delegate.py +++ b/backends/example/test_example_delegate.py @@ -46,7 +46,7 @@ def get_example_inputs(): ) m = model.eval() - m = torch._export.capture_pre_autograd_graph(m, copy.deepcopy(example_inputs)) + m = torch.export.export_for_training(m, copy.deepcopy(example_inputs)).module() # print("original model:", m) quantizer = ExampleQuantizer() # quantizer = XNNPACKQuantizer() @@ -82,7 +82,7 @@ def test_delegate_mobilenet_v2(self): ) m = model.eval() - m = torch._export.capture_pre_autograd_graph(m, copy.deepcopy(example_inputs)) + m = torch.export.export_for_training(m, copy.deepcopy(example_inputs)).module() quantizer = ExampleQuantizer() m = prepare_pt2e(m, quantizer) diff --git a/backends/mediatek/CMakeLists.txt b/backends/mediatek/CMakeLists.txt index c7de8bb1f04..744b1193d5a 100644 --- a/backends/mediatek/CMakeLists.txt +++ b/backends/mediatek/CMakeLists.txt @@ -10,40 +10,39 @@ # Let include directory as "executorch/..." set(_common_include_directories ${CMAKE_CURRENT_SOURCE_DIR}/../../..) 
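The test change above replaces torch._export.capture_pre_autograd_graph with torch.export.export_for_training(...).module(). A minimal sketch of that migration with a toy model (export_for_training requires a PyTorch build that ships it; prepare_pt2e would then run on the returned module, as in the test):

import copy

import torch


class SmallModel(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.linear = torch.nn.Linear(8, 4)

    def forward(self, x):
        return torch.relu(self.linear(x))


if __name__ == "__main__":
    model = SmallModel().eval()
    example_inputs = (torch.randn(2, 8),)

    # Old: m = torch._export.capture_pre_autograd_graph(model, example_inputs)
    m = torch.export.export_for_training(
        model, copy.deepcopy(example_inputs)
    ).module()

    print(type(m))  # a GraphModule ready for prepare_pt2e(m, quantizer)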
-set(NEURON_BUFFER_ALLOCATOR_LIB "" CACHE PATH "Path to Neuron Buffer Allocator library") -message(STATUS "Looking for neuron_buffer_allocator in ${NEURON_BUFFER_ALLOCATOR_LIB}") - -include_directories( - BEFORE - ${_common_include_directories} +set(NEURON_BUFFER_ALLOCATOR_LIB + "" + CACHE PATH "Path to Neuron Buffer Allocator library" +) +message( + STATUS "Looking for neuron_buffer_allocator in ${NEURON_BUFFER_ALLOCATOR_LIB}" ) +include_directories(BEFORE ${_common_include_directories}) + # shortcut include directory for neuron headers -include_directories( - BEFORE - ${CMAKE_CURRENT_SOURCE_DIR}/runtime/include -) +include_directories(BEFORE ${CMAKE_CURRENT_SOURCE_DIR}/runtime/include) # targets add_library(neuron_backend SHARED) target_link_libraries(neuron_backend PRIVATE executorch_no_prim_ops + portable_ops_lib android log ${NEURON_BUFFER_ALLOCATOR_LIB} ) -target_sources(neuron_backend - INTERFACE - ${CMAKE_CURRENT_LIST_DIR}/runtime/include/NeuronBackend.h - ${CMAKE_CURRENT_LIST_DIR}/runtime/include/NeuronBufferAllocator.h - ${CMAKE_CURRENT_LIST_DIR}/runtime/include/NeuronExecutor.h - ${CMAKE_CURRENT_LIST_DIR}/runtime/include/NeuronLog.h - ${CMAKE_CURRENT_LIST_DIR}/runtime/include/api/APUWareUtilsLib.h - ${CMAKE_CURRENT_LIST_DIR}/runtime/include/api/NeuronAdapterShim.h - PRIVATE - ${CMAKE_CURRENT_LIST_DIR}/runtime/NeuronBackend.cpp - ${CMAKE_CURRENT_LIST_DIR}/runtime/NeuronExecutor.cpp +target_sources( + neuron_backend + INTERFACE ${CMAKE_CURRENT_LIST_DIR}/runtime/include/NeuronBackend.h + ${CMAKE_CURRENT_LIST_DIR}/runtime/include/NeuronBufferAllocator.h + ${CMAKE_CURRENT_LIST_DIR}/runtime/include/NeuronExecutor.h + ${CMAKE_CURRENT_LIST_DIR}/runtime/include/NeuronLog.h + ${CMAKE_CURRENT_LIST_DIR}/runtime/include/api/APUWareUtilsLib.h + ${CMAKE_CURRENT_LIST_DIR}/runtime/include/api/NeuronAdapterShim.h + PRIVATE ${CMAKE_CURRENT_LIST_DIR}/runtime/NeuronBackend.cpp + ${CMAKE_CURRENT_LIST_DIR}/runtime/NeuronExecutor.cpp ) target_link_options_shared_lib(neuron_backend) diff --git a/backends/mediatek/runtime/include/NeuronBackend.h b/backends/mediatek/runtime/include/NeuronBackend.h index 2cfcb311b93..7a22956de63 100644 --- a/backends/mediatek/runtime/include/NeuronBackend.h +++ b/backends/mediatek/runtime/include/NeuronBackend.h @@ -26,7 +26,7 @@ namespace torch { namespace executor { -class NeuronBackend final : public PyTorchBackendInterface { +class NeuronBackend final : public ::executorch::runtime::BackendInterface { public: Result init( BackendInitContext& context, diff --git a/backends/qualcomm/CMakeLists.txt b/backends/qualcomm/CMakeLists.txt index babdb96d8bc..a8265df8c7b 100644 --- a/backends/qualcomm/CMakeLists.txt +++ b/backends/qualcomm/CMakeLists.txt @@ -66,9 +66,7 @@ if(CMAKE_BUILD_TYPE STREQUAL "Release") add_link_options("-s") # --gc-sections is added by torch. 
- add_compile_options( - "-O3" "-ffunction-sections" "-fdata-sections" "-frtti" - ) + add_compile_options("-O3" "-ffunction-sections" "-fdata-sections" "-frtti") endif() include_directories( @@ -183,7 +181,10 @@ target_link_libraries( ) target_link_libraries( qnn_executorch_backend PRIVATE qnn_executorch_header qnn_schema qnn_manager - executorch_no_prim_ops qcir_utils + executorch_no_prim_ops qcir_utils extension_tensor +) +set_target_properties( + qnn_executorch_backend PROPERTIES LINK_FLAGS "-Wl,-rpath='$ORIGIN'" ) target_link_libraries(utils PRIVATE qnn_executorch_logging) target_link_libraries( @@ -245,6 +246,7 @@ if(${CMAKE_SYSTEM_PROCESSOR} MATCHES "x86_64") qnn_executorch_header executorch qcir_utils + extension_tensor ) target_link_libraries( PyQnnWrapperAdaptor PRIVATE pybind11::module pybind11::lto wrappers @@ -261,11 +263,8 @@ if(${CMAKE_SYSTEM_PROCESSOR} MATCHES "x86_64") if(CMAKE_BUILD_TYPE STREQUAL "Release") # need to allow exceptions in pybind - set(_pybind_compile_options - -Wno-deprecated-declarations - -fPIC - -frtti - -fexceptions + set(_pybind_compile_options -Wno-deprecated-declarations -fPIC -frtti + -fexceptions ) target_compile_options( PyQnnManagerAdaptor PUBLIC ${_pybind_compile_options} diff --git a/backends/qualcomm/builders/__init__.py b/backends/qualcomm/builders/__init__.py index d3bf98bae72..79c02e22072 100644 --- a/backends/qualcomm/builders/__init__.py +++ b/backends/qualcomm/builders/__init__.py @@ -38,6 +38,7 @@ op_quantize, op_relu, op_reshape, + op_rms_norm, op_rsqrt, op_select_copy, op_sigmoid, @@ -92,6 +93,7 @@ op_quantize, op_relu, op_reshape, + op_rms_norm, op_rsqrt, op_select_copy, op_sigmoid, diff --git a/backends/qualcomm/builders/node_visitor.py b/backends/qualcomm/builders/node_visitor.py index e07a745df5f..514bc6efd78 100644 --- a/backends/qualcomm/builders/node_visitor.py +++ b/backends/qualcomm/builders/node_visitor.py @@ -202,7 +202,7 @@ def get_quant_tensor_value( dtype = quant_configs[QCOM_DTYPE] - tensor = tensor.div(scale + 1e-6).add(zero_point).round().to(dtype) + tensor = tensor.div(scale).add(zero_point).round().to(dtype) # Make the backends access data correctly if quant_configs.get(QCOM_BITWIDTH) == 4: mask = torch.full(tensor.size(), 0x0F, dtype=torch.int8) diff --git a/backends/qualcomm/builders/op_batch_norm.py b/backends/qualcomm/builders/op_batch_norm.py index 13b24c0d722..9ca299e7432 100644 --- a/backends/qualcomm/builders/op_batch_norm.py +++ b/backends/qualcomm/builders/op_batch_norm.py @@ -8,6 +8,11 @@ import executorch.backends.qualcomm.python.PyQnnWrapperAdaptor as PyQnnWrapper import torch +from executorch.backends.qualcomm.utils.constants import ( + QCOM_QUANT_ATTRS, + QCOM_QUANT_MAX, + QCOM_SCALE, +) from .node_visitor import NodeVisitor, register_node_visitor from .qnn_constants import OpBatchnorm, QNN_OP_PACKAGE_NAME_QTI_AISW @@ -21,6 +26,15 @@ class BatchNorm(NodeVisitor): def __init__(self, *args) -> None: super().__init__(*args) + def update_encoding(self, node: torch.fx.Node, tensor: torch.Tensor, eps): + if isinstance(tensor, torch._subclasses.FakeTensor): + return + + if quant_attrs := node.meta.get(QCOM_QUANT_ATTRS): + # scale value equals to zero will cause failure in HTP + diff = max(abs(tensor.max()), abs(tensor.min())) + eps + quant_attrs[QCOM_SCALE] = diff / quant_attrs[QCOM_QUANT_MAX] + def define_node( self, node: torch.fx.Node, @@ -29,7 +43,7 @@ def define_node( input_node = node.args[0] input_tensor = self.get_tensor(input_node, node) - mean_node, var_node, eps = node.args[3], node.args[4], 1e-5 + 
mean_node, var_node, eps = node.args[3], node.args[4], 1e-9 mean_tensor = get_parameter(mean_node, self.edge_program) var_tensor = get_parameter(var_node, self.edge_program) @@ -48,6 +62,7 @@ def define_node( amount = (filter_tensor * mean_tensor) / torch.sqrt(var_tensor + eps) bias_tensor = bias_tensor - amount + self.update_encoding(bias_node, bias_tensor, eps) bias_tensor_wrapper = self.define_tensor( bias_node, bias_tensor, @@ -57,6 +72,7 @@ def define_node( ) filter_tensor = filter_tensor / torch.sqrt(var_tensor + eps) + self.update_encoding(filter_node, filter_tensor, eps) filter_tensor_wrapper = self.define_tensor( filter_node, filter_tensor, diff --git a/backends/qualcomm/builders/op_conv2d.py b/backends/qualcomm/builders/op_conv2d.py index 909cc6a21f6..4b58edbac63 100644 --- a/backends/qualcomm/builders/op_conv2d.py +++ b/backends/qualcomm/builders/op_conv2d.py @@ -10,16 +10,7 @@ import numpy as np import torch -from executorch.backends.qualcomm.utils.constants import ( - QCOM_DATA, - QCOM_DTYPE, - QCOM_QUANT_ATTRS, - QCOM_QUANT_MAX, - QCOM_QUANT_MIN, - QCOM_SCALE, - QCOM_ZERO_POINT, -) -from executorch.exir.dialects._ops import ops as exir_ops +from executorch.backends.qualcomm.utils.constants import QCOM_DATA from .node_visitor import NodeVisitor, register_node_visitor from .qnn_constants import ( @@ -94,52 +85,6 @@ def _add_conv_op_parameter( return conv_op - def _get_bias_tensor( - self, - node: torch.fx.Node, - nodes_to_wrappers: Dict[str, PyQnnWrapper.TensorWrapper], - num_output_channel: int, - ) -> PyQnnWrapper.PyQnnOpWrapper: - # build dummy node if bias is not given - bias_node = ( - node.args[2] - if node.args[2] is not None - else torch.fx.Node( - node.graph, - node.name + "_runtime_bias", - "call_function", - exir_ops.edge.aten.full.default, - (), # args - {}, # kwargs - ) - ) - # zeros tensor to meet HTP constraint if bias is not given - bias_tensor = ( - get_parameter(bias_node, self.edge_program) - if node.args[2] is not None - else torch.zeros(num_output_channel) - ) - # insert quant attribute to meet HTP constraint if bias is not given - if ( - node.args[2] is None - and (bias_quant_attrs := node.meta.get(QCOM_QUANT_ATTRS)) is not None - ): - quant_attrs = bias_quant_attrs.copy() - quant_attrs[QCOM_ZERO_POINT] = 0 - quant_attrs[QCOM_SCALE] = 0 - quant_attrs[QCOM_DTYPE] = torch.int32 - quant_attrs[QCOM_QUANT_MAX] = torch.iinfo(torch.int32).max - quant_attrs[QCOM_QUANT_MIN] = torch.iinfo(torch.int32).min + 1 - bias_node.meta[QCOM_QUANT_ATTRS] = quant_attrs - - return self.define_tensor( - bias_node, - bias_tensor, - PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_STATIC, - nodes_to_wrappers, - is_input_tensor=False, - ) - def _define_conv1d( self, node: torch.fx.Node, @@ -204,9 +149,17 @@ def _define_conv1d( is_input_tensor=False, ) conv_input_tensors = [unsqueeze_output_tensor_wrapper, filter_tensor_wrapper] - conv_input_tensors.append( - self._get_bias_tensor(node, nodes_to_wrappers, filter_tensor.shape[-1]) - ) + if node.args[2] is not None: + bias_node = node.args[2] + bias_tensor = get_parameter(bias_node, self.edge_program) + bias_tensor_wrapper = self.define_tensor( + bias_node, + bias_tensor, + PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_STATIC, + nodes_to_wrappers, + is_input_tensor=False, + ) + conv_input_tensors.append(bias_tensor_wrapper) stride = [1] + cast(List[int], node.args[3]) padding = [0] + cast(List[int], node.args[4]) @@ -312,9 +265,18 @@ def define_node( is_input_tensor=False, ) conv_input_tensors = [input_tensor_wrapper, 
filter_tensor_wrapper] - conv_input_tensors.append( - self._get_bias_tensor(node, nodes_to_wrappers, filter_tensor.shape[-1]) - ) + + if node.args[2] is not None: + bias_node = node.args[2] + bias_tensor = get_parameter(bias_node, self.edge_program) + bias_tensor_wrapper = self.define_tensor( + bias_node, + bias_tensor, + PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_STATIC, + nodes_to_wrappers, + is_input_tensor=False, + ) + conv_input_tensors.append(bias_tensor_wrapper) output_tensor = self.get_tensor(node, node) output_tensor_wrapper = self.define_tensor( diff --git a/backends/qualcomm/builders/op_rms_norm.py b/backends/qualcomm/builders/op_rms_norm.py new file mode 100644 index 00000000000..e99b1f47ba1 --- /dev/null +++ b/backends/qualcomm/builders/op_rms_norm.py @@ -0,0 +1,127 @@ +# Copyright (c) Qualcomm Innovation Center, Inc. +# All rights reserved +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +from typing import Dict + +import executorch.backends.qualcomm.python.PyQnnWrapperAdaptor as PyQnnWrapper +import numpy as np + +import torch +from executorch.backends.qualcomm.builders.utils import get_parameter +from executorch.backends.qualcomm.utils.constants import QCOM_DATA, QCOM_QUANT_ATTRS +from executorch.exir.dialects._ops import ops as exir_ops + +from .node_visitor import NodeVisitor, register_node_visitor +from .qnn_constants import OpRmsNorm, QNN_OP_PACKAGE_NAME_QTI_AISW + + +@register_node_visitor +class RmsNormVisitor(NodeVisitor): + target = ["aten.rms_norm.default"] + + def __init__(self, *args) -> None: + super().__init__(*args) + + def define_node( + self, + node: torch.fx.Node, + nodes_to_wrappers: Dict[torch.fx.Node, PyQnnWrapper.TensorWrapper], + ) -> PyQnnWrapper.PyQnnOpWrapper: + # args of node : ['input', 'normalized_shape', 'weight', 'eps'] + input_node = node.args[0] + input_tensor = self.get_tensor(input_node, node) + input_tensor_wrapper = self.define_tensor( + input_node, + input_tensor, + PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_NATIVE, + nodes_to_wrappers, + is_input_tensor=True, + ) + + # should be a immutable list + normalized_shapes = node.args[1] + if ( + len(normalized_shapes) != 1 + and normalized_shapes[0] != input_tensor.shape[-1] + ): + print("Only supports normalization with last input dimension") + return + axes = [node.args[0].meta["val"].dim() - 1] + axes_shape = [len(axes)] + + weight_node = node.args[2] + weight_tensor = get_parameter(weight_node, self.edge_program) + weight_tensor_wrapper = self.define_tensor( + weight_node, + weight_tensor, + PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_STATIC, + nodes_to_wrappers, + is_input_tensor=False, + ) + + # Fake node, nn moudle seems to be inconsistant with document + bias_tensor = torch.zeros(weight_tensor.shape) + bias_node = torch.fx.Node( + node.graph, + node.name + "_runtime_bias", + "call_function", + exir_ops.edge.aten.tensor.default, + (), # args + {}, # kwargs + ) + if quant_attrs := node.meta.get(QCOM_QUANT_ATTRS): + bias_node.meta[QCOM_QUANT_ATTRS] = quant_attrs + bias_tensor_wrapper = self.define_tensor( + bias_node, + bias_tensor, + PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_STATIC, + nodes_to_wrappers, + is_input_tensor=False, + ) + + epsilon = node.args[3] + if isinstance(epsilon, torch.fx.Node): + epsilon = get_parameter(epsilon, self.edge_program) + epsilon = ( + epsilon + if isinstance(epsilon, float) + else torch.finfo(epsilon.dtype).eps + ) + + output_tensor = self.get_tensor(node, 
node) + output_tensor_wrapper = self.define_tensor( + node, + output_tensor, + PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_NATIVE, + nodes_to_wrappers, + is_input_tensor=False, + ) + + rms_nrom_op = PyQnnWrapper.PyQnnOpWrapper( + node.name, + QNN_OP_PACKAGE_NAME_QTI_AISW, + OpRmsNorm.op_name, + ) + + rms_nrom_op.AddInputTensors( + [input_tensor_wrapper, weight_tensor_wrapper, bias_tensor_wrapper] + ) + rms_nrom_op.AddOutputTensors([output_tensor_wrapper]) + rms_nrom_op.AddScalarParam( + OpRmsNorm.param_epsilon, + PyQnnWrapper.Qnn_DataType_t.QNN_DATATYPE_FLOAT_32, + {QCOM_DATA: np.float32(epsilon)}, + ) + rms_nrom_op.AddTensorParam( + OpRmsNorm.param_axes, + PyQnnWrapper.Qnn_DataType_t.QNN_DATATYPE_UINT_32, + len(axes_shape), + axes_shape, + np.array(axes, dtype=np.uint32), + True, + ) + + return rms_nrom_op diff --git a/backends/qualcomm/builders/op_softmax.py b/backends/qualcomm/builders/op_softmax.py index ae4c89bbb96..cda40aed458 100644 --- a/backends/qualcomm/builders/op_softmax.py +++ b/backends/qualcomm/builders/op_softmax.py @@ -17,7 +17,7 @@ @register_node_visitor class Softmax(NodeVisitor): - target = ["aten._softmax.default"] + target = ["aten._softmax.default", "aten._safe_softmax.default"] def __init__(self, *args) -> None: super().__init__(*args) diff --git a/backends/qualcomm/builders/qnn_constants.py b/backends/qualcomm/builders/qnn_constants.py index 4a87e5dbbb3..8ac702f2ad5 100644 --- a/backends/qualcomm/builders/qnn_constants.py +++ b/backends/qualcomm/builders/qnn_constants.py @@ -278,6 +278,13 @@ class OpResizeNearestNeighbor: param_half_pixel_centers: str = "half_pixel_centers" +@dataclass(init=False, frozen=True) +class OpRmsNorm: + op_name: str = "RmsNorm" + param_epsilon: str = "epsilon" + param_axes: str = "axes" + + @dataclass(init=False, frozen=True) class OpScatterNd: op_name: str = "ScatterNd" diff --git a/backends/qualcomm/partition/common_defs.py b/backends/qualcomm/partition/common_defs.py index 353169bc186..d68441c2f79 100644 --- a/backends/qualcomm/partition/common_defs.py +++ b/backends/qualcomm/partition/common_defs.py @@ -17,7 +17,11 @@ ] to_be_implemented_operator = [ - exir_ops.edge.aten.where.default, + exir_ops.edge.aten.any.dim, + exir_ops.edge.aten.eq.Scalar, + exir_ops.edge.aten.full_like.default, + exir_ops.edge.aten.logical_not.default, + exir_ops.edge.aten.where.self, ] allow_list_operator = [ diff --git a/backends/qualcomm/partition/qnn_partitioner.py b/backends/qualcomm/partition/qnn_partitioner.py index 73dbede8ff6..659bda517f0 100644 --- a/backends/qualcomm/partition/qnn_partitioner.py +++ b/backends/qualcomm/partition/qnn_partitioner.py @@ -44,16 +44,7 @@ def __init__( ): self.node_visitors = node_visitor.get_node_visitors(edge_program) - self.skip_node_op_builder_set = set() - if skip_node_op_set is not None: - self.skip_node_op_builder_set = set( - [ - self.node_visitors[val] - for val in skip_node_op_set - if val in self.node_visitors - ] - ) - + self.skip_node_op_set = skip_node_op_set self.skip_node_id_set = skip_node_id_set self.nodes_to_wrappers = defaultdict(dict) self.qnn_manager = PyQnnManager.QnnManager( @@ -75,14 +66,9 @@ def is_node_supported(self, _, node: torch.fx.Node) -> bool: if node.target in allow_list_operator: return True - if self.skip_node_id_set is not None and node.name in self.skip_node_id_set: - print(f"[QNN Partitioner Op Support]: {node.target.__name__} | Skipped") - return False - if ( - self.skip_node_op_builder_set is not None - and self.node_visitors[node.target.__name__] - in 
self.skip_node_op_builder_set + node.name in self.skip_node_id_set + or node.target.__name__ in self.skip_node_op_set ): print(f"[QNN Partitioner Op Support]: {node.target.__name__} | Skipped") return False @@ -124,8 +110,8 @@ def __init__( QnnBackend.__name__, self.compiler_specs_snapshot ) self.partition_tags: Dict[str, DelegationSpec] = {} - self.skip_node_id_set = skip_node_id_set - self.skip_node_op_set = skip_node_op_set + self.skip_node_id_set = set() if skip_node_id_set is None else skip_node_id_set + self.skip_node_op_set = set() if skip_node_op_set is None else skip_node_op_set def generate_partitions( self, edge_program: torch.export.ExportedProgram diff --git a/backends/qualcomm/passes/annotate_and_quant_scalar.py b/backends/qualcomm/passes/annotate_and_quant_scalar.py index 5f111ee9c8b..1db50694ece 100644 --- a/backends/qualcomm/passes/annotate_and_quant_scalar.py +++ b/backends/qualcomm/passes/annotate_and_quant_scalar.py @@ -14,7 +14,7 @@ from executorch.exir.passes import dead_code_elimination_pass from torch.fx.passes.utils.source_matcher_utils import get_source_partitions -from .utils import get_quant_attrs +from .utils import dq_ops, get_quant_attrs class AnnotateAndQuantScalar(ExportPass): @@ -78,6 +78,7 @@ def _annotate_scalar_node( float, torch.float32, torch.int32, + torch.int64, ]: return @@ -88,30 +89,43 @@ def _traverse_binary_node(self, graph_module: torch.fx.GraphModule): graph_module.graph, self.binary_op_sources ) src_partitions = list(itertools.chain(*src_partitions.values())) + processed = set() for src_partition in src_partitions: - output = src_partition.output_nodes[0] - if ( - output.meta.get(QCOM_QUANT_ATTRS) - and len(src_partition.input_nodes) == 1 - ): - dq_node = src_partition.input_nodes[0] - q_node = dq_node.args[0] - q_node_attrs = get_quant_attrs(graph_module, q_node) - - scalar_nodes = [n for n in output.args if n != dq_node] - if len(scalar_nodes) == 0: + # need post process here to identify partitioned nodes: + src_fn_dict = {} + for n in src_partition.nodes: + # e.g. 
+                #      meta["source_fn_stack"]: [('mul', <built-in function mul>)]
+                # we'll use the function in the last stack entry as the grouping key
+                node_list = src_fn_dict.setdefault(n.meta["source_fn_stack"][-1][1], [])
+                node_list.append(n)
+
+            for nodes in src_fn_dict.values():
+                output = [n for n in nodes if n in src_partition.output_nodes][0]
+                # if all args have been annotated, it shouldn't be a scalar operation
+                if all(arg.target in dq_ops for arg in output.args):
                     continue
 
-                scalar_node = scalar_nodes[0]
-                source_scalar_node = self._get_source_scalar_node(scalar_node)
-                # we'll abandon cast op here, since the constant scalar will
-                # be pre-loaded into QNN context binary
-                output.replace_input_with(scalar_node, source_scalar_node)
+                if output not in processed and QCOM_QUANT_ATTRS in output.meta:
+                    dq_node = [n for n in output.args if n.target in dq_ops][0]
+                    q_node = dq_node.args[0]
+                    q_node_attrs = get_quant_attrs(graph_module, q_node)
+
+                    scalar_nodes = [n for n in output.args if n != dq_node]
+                    if len(scalar_nodes) == 0:
+                        continue
+
+                    scalar_node = scalar_nodes[0]
+                    source_scalar_node = self._get_source_scalar_node(scalar_node)
+                    # we'll abandon cast op here, since the constant scalar will
+                    # be pre-loaded into QNN context binary
+                    output.replace_input_with(scalar_node, source_scalar_node)
 
-                scalar_quant_attrs = self._update_scalar_node_attrs(
-                    source_scalar_node, q_node_attrs
-                )
-                self._annotate_scalar_node(source_scalar_node, scalar_quant_attrs)
+                    scalar_quant_attrs = self._update_scalar_node_attrs(
+                        source_scalar_node, q_node_attrs
+                    )
+                    self._annotate_scalar_node(source_scalar_node, scalar_quant_attrs)
+                    processed.add(output)
 
     def call(self, graph_module: torch.fx.GraphModule):
         self._traverse_binary_node(graph_module)
diff --git a/backends/qualcomm/passes/i64_to_i32.py b/backends/qualcomm/passes/i64_to_i32.py
index 7814a3ff0d6..1d2171cc37a 100644
--- a/backends/qualcomm/passes/i64_to_i32.py
+++ b/backends/qualcomm/passes/i64_to_i32.py
@@ -5,7 +5,9 @@
 # LICENSE file in the root directory of this source tree.
import torch from executorch.backends.qualcomm.builders.utils import get_parameter, is_constant +from executorch.exir.dialects._ops import ops as exir_ops from executorch.exir.pass_base import ExportPass, PassResult +from torch._subclasses.fake_tensor import FakeTensor class I64toI32(ExportPass): @@ -16,6 +18,8 @@ class I64toI32(ExportPass): def __init__(self, edge_program: torch.export.ExportedProgram): super(I64toI32, self).__init__() self.edge_program = edge_program + # pyre-ignore[4] + self.copy_op = exir_ops.edge.aten._to_copy.default def _update_meta(self, node: torch.fx.node) -> None: meta_val = node.meta["val"] @@ -32,6 +36,10 @@ def _update_meta(self, node: torch.fx.node) -> None: if meta_val.dtype == torch.int64: node.meta["val"] = meta_val.to(torch.float) + # pyre-ignore[2] + def _is_tensor_of_dtype(self, node_val, dtype: torch.dtype) -> bool: + return isinstance(node_val, FakeTensor) and node_val.dtype == dtype + def _cast_to_int32(self, graph_module: torch.fx.GraphModule): for n in graph_module.graph.nodes: if is_constant(n, self.edge_program): @@ -39,6 +47,22 @@ def _cast_to_int32(self, graph_module: torch.fx.GraphModule): if param.dtype == torch.int64: # QNN does not support int64 self._update_meta(n) + elif n.op == "placeholder": + node_val = n.meta["val"] + if self._is_tensor_of_dtype(node_val, torch.int64): + with graph_module.graph.inserting_after(n): + args = (n,) + to_dst_node = graph_module.graph.create_node( + "call_function", + self.copy_op, + args, + {"dtype": torch.int32}, + ) + to_dst_node.meta["val"] = node_val.to(torch.int32) + + # Replace usage of the src dtype result with the dst dtype result. + n.replace_all_uses_with(to_dst_node) + to_dst_node.args = (n,) def call(self, graph_module: torch.fx.GraphModule): self._cast_to_int32(graph_module) diff --git a/backends/qualcomm/passes/recompose_pixel_shuffle.py b/backends/qualcomm/passes/recompose_pixel_shuffle.py deleted file mode 100644 index 9eec6bfa264..00000000000 --- a/backends/qualcomm/passes/recompose_pixel_shuffle.py +++ /dev/null @@ -1,46 +0,0 @@ -# Copyright (c) Qualcomm Innovation Center, Inc. -# All rights reserved -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. -import torch -from executorch.exir.dialects._ops import ops as exir_ops -from executorch.exir.pass_base import ExportPass, PassResult -from torch.fx.passes.utils.source_matcher_utils import get_source_partitions - - -class RecomposePixelShuffle(ExportPass): - """ - Merge decomposed operators back to one super node. 
- """ - - def __init__(self): - super().__init__() - - def call(self, graph_module: torch.fx.GraphModule): - graph = graph_module.graph - # decomposed core aten ops - partitions = get_source_partitions(graph, [torch.nn.PixelShuffle]) - for _, src_partitions in partitions.items(): - for src_partition in src_partitions: - input_node = src_partition.input_nodes[0] - output_node = src_partition.output_nodes[0] - with graph.inserting_after(input_node): - h_in_shape = input_node.meta["val"].shape[2] - h_out_shape = output_node.meta["val"].shape[2] - upscale_factor = h_out_shape / h_in_shape - - pixel_shuffle_node = graph.create_node( - "call_function", - exir_ops.edge.aten.pixel_shuffle.default, - (input_node, int(upscale_factor)), - ) - users = output_node.users.copy() - for user in users: - user.replace_input_with(output_node, pixel_shuffle_node) - # copy metadata - pixel_shuffle_node.meta = output_node.meta - - graph.eliminate_dead_code() - graph_module.recompile() - return PassResult(graph_module, True) diff --git a/backends/qualcomm/passes/recompose_pixel_unshuffle.py b/backends/qualcomm/passes/recompose_pixel_unshuffle.py index a47f3d119a5..00d46639089 100644 --- a/backends/qualcomm/passes/recompose_pixel_unshuffle.py +++ b/backends/qualcomm/passes/recompose_pixel_unshuffle.py @@ -6,7 +6,6 @@ import torch from executorch.exir.dialects._ops import ops as exir_ops from executorch.exir.pass_base import ExportPass, PassResult -from torch.fx.passes.utils.source_matcher_utils import get_source_partitions class RecomposePixelUnshuffle(ExportPass): @@ -85,30 +84,6 @@ def call(self, graph_module: torch.fx.GraphModule): # copy metadata pixel_unshuffle_node.meta = node.meta - # decomposed core aten ops - if not self.quantization_capture: - partitions = get_source_partitions(graph, [torch.nn.PixelUnshuffle]) - for _, src_partitions in partitions.items(): - for src_partition in src_partitions: - input_node = src_partition.input_nodes[0] - output_node = src_partition.output_nodes[0] - with graph.inserting_after(input_node): - h_in_shape = input_node.meta["val"].shape[2] - h_out_shape = output_node.meta["val"].shape[2] - downscale_factor = h_in_shape / h_out_shape - - op = self.op - pixel_unshuffle_node = graph.create_node( - "call_function", - op, - (input_node, int(downscale_factor)), - ) - users = output_node.users.copy() - for user in users: - user.replace_input_with(output_node, pixel_unshuffle_node) - # copy metadata - pixel_unshuffle_node.meta = output_node.meta - graph.eliminate_dead_code() graph_module.recompile() return PassResult(graph_module, True) diff --git a/backends/qualcomm/passes/recompose_rms_norm.py b/backends/qualcomm/passes/recompose_rms_norm.py new file mode 100644 index 00000000000..b26de8bd794 --- /dev/null +++ b/backends/qualcomm/passes/recompose_rms_norm.py @@ -0,0 +1,76 @@ +# Copyright (c) Qualcomm Innovation Center, Inc. +# All rights reserved +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. +import torch +from executorch.exir.dialects._ops import ops as exir_ops +from executorch.exir.pass_base import ExportPass, PassResult +from torch.fx.passes.utils.source_matcher_utils import get_source_partitions + +from .utils import dq_ops + + +class RecomposeRmsNorm(ExportPass): + """ + Merge decomposed operators back to one super node. 
+    """
+
+    def __init__(self):
+        super().__init__()
+
+    def _get_eps_node(self, nodes):
+        # eps: one of the inputs of the add node
+        add_node = [n for n in nodes if hasattr(n, "name") and "add" in n.name][0]
+        for a in add_node.args:
+            if isinstance(a, float) or a.op != "call_function":
+                return a
+
+    def _get_gamma_node(self, output_node):
+        # gamma: one of the inputs of the output node
+        for a in output_node.args:
+            if a.op != "call_function" or a.target in dq_ops:
+                return a
+
+    def call(self, graph_module: torch.fx.GraphModule):
+        graph = graph_module.graph
+        partitions = get_source_partitions(graph, [torch.nn.RMSNorm])
+        for _, src_partitions in partitions.items():
+            for src_partition in src_partitions:
+                input_len = len(src_partition.input_nodes)
+                if input_len == 1:
+                    input_node = src_partition.input_nodes[0]
+                elif input_len == 2:
+                    inp_0, inp_1 = src_partition.input_nodes
+                    input_node = inp_0 if len(inp_0.users) == 2 else inp_1
+                else:
+                    raise RuntimeError(
+                        f"Found an edge case of rms_norm partition {src_partition}, which has {input_len} inputs"
+                    )
+
+                output_node = src_partition.output_nodes[0]
+                eps_node = self._get_eps_node(src_partition.nodes)
+                gamma_node = self._get_gamma_node(output_node)
+
+                with graph.inserting_before(output_node):
+                    # args schema
+                    # (Tensor input, int[] normalized_shape, Tensor? weight=None, float? eps=None) -> Tensor
+                    rms_node = graph.create_node(
+                        "call_function",
+                        exir_ops.edge.aten.rms_norm.default,
+                        (
+                            input_node,
+                            list(gamma_node.meta["val"].shape),
+                            gamma_node,
+                            eps_node,
+                        ),
+                    )
+                    users = output_node.users.copy()
+                    for user in users:
+                        user.replace_input_with(output_node, rms_node)
+                    # copy metadata
+                    rms_node.meta = output_node.meta
+
+        graph.eliminate_dead_code()
+        graph_module.recompile()
+        return PassResult(graph_module, True)
diff --git a/backends/qualcomm/passes/replace_index_put_input.py b/backends/qualcomm/passes/replace_index_put_input.py
new file mode 100644
index 00000000000..1eb210cf67e
--- /dev/null
+++ b/backends/qualcomm/passes/replace_index_put_input.py
@@ -0,0 +1,54 @@
+# Copyright (c) Qualcomm Innovation Center, Inc.
+# All rights reserved
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+import torch +from executorch.backends.qualcomm.utils.constants import QCOM_ENCODING, QCOM_QUANT_ATTRS +from executorch.exir.dialects._ops import ops as exir_ops +from executorch.exir.pass_base import ExportPass, PassResult + + +class ReplaceIndexPutInput(ExportPass): + """ + Index put input workaround for quantized module + """ + + dq_q_map = { + # per tensor + exir_ops.edge.quantized_decomposed.dequantize_per_tensor.default: exir_ops.edge.quantized_decomposed.quantize_per_tensor.default, + exir_ops.edge.quantized_decomposed.dequantize_per_tensor.tensor: exir_ops.edge.quantized_decomposed.quantize_per_tensor.tensor, + # per channel + exir_ops.edge.quantized_decomposed.dequantize_per_channel.default: exir_ops.edge.quantized_decomposed.quantize_per_channel.default, + } + + def __init__(self, edge_program: torch.export.ExportedProgram): + super(ReplaceIndexPutInput, self).__init__() + self.edge_program = edge_program + + def call(self, graph_module: torch.fx.GraphModule): + graph = graph_module.graph + for node in graph.nodes: + if node.target == exir_ops.edge.aten.index_put.default: + if ( + copy_node := list(node.users)[0] + ) and copy_node.target == exir_ops.edge.aten.copy.default: + m_buffer_node = copy_node.args[0] + bad_frozen_node = node.args[0] + if QCOM_QUANT_ATTRS in bad_frozen_node.meta: + m_buffer_node.meta[QCOM_QUANT_ATTRS] = bad_frozen_node.meta[ + QCOM_QUANT_ATTRS + ] + m_buffer_node.meta[QCOM_QUANT_ATTRS][QCOM_ENCODING] = ( + self.dq_q_map[ + m_buffer_node.meta[QCOM_QUANT_ATTRS][QCOM_ENCODING] + ] + ) + with graph.inserting_after(bad_frozen_node): + node.replace_input_with(bad_frozen_node, m_buffer_node) + else: + continue + + graph.eliminate_dead_code() + graph_module.recompile() + return PassResult(graph_module, True) diff --git a/backends/qualcomm/quantizer/custom_annotation.py b/backends/qualcomm/quantizer/custom_annotation.py index b2c86e50d33..9cde50b9c70 100644 --- a/backends/qualcomm/quantizer/custom_annotation.py +++ b/backends/qualcomm/quantizer/custom_annotation.py @@ -91,15 +91,17 @@ def is_edge_condition(node: Node): def annotate_matmul_input1(node: Node, quantization_config: QuantizationConfig): if is_edge_condition(node): return - if node.target == torch.ops.aten.index_put_.default: + if node.target in [ + torch.ops.aten.index_put.default, + torch.ops.aten.index_put_.default, + ]: annotate_index_put(node, quantization_config) annotate_matmul_input1(node.args[0], quantization_config) elif node.target == torch.ops.aten.cat.default: annotate_cat(node, quantization_config) # Expect that the inputs of the cat op are select ops - for arg in node.args[0][1:]: - annotate_single_in_single_out(arg, quantization_config) - annotate_matmul_input1(node.args[0][0], quantization_config) + for arg in node.args[0]: + annotate_matmul_input1(arg, quantization_config) else: annotate_single_in_single_out(node, quantization_config) annotate_matmul_input1(node.args[0], quantization_config) diff --git a/backends/qualcomm/quantizer/quantizer.py b/backends/qualcomm/quantizer/quantizer.py index d51e016473f..e27edf939c8 100644 --- a/backends/qualcomm/quantizer/quantizer.py +++ b/backends/qualcomm/quantizer/quantizer.py @@ -116,7 +116,7 @@ def _update_per_channel_weight_quant_ops(self, ops: Set[OpOverload], enable: boo if enable: self.use_per_channel_weight_quant_ops.update(ops) else: - self.use_per_channel_weight_quant_ops.difference(ops) + self.use_per_channel_weight_quant_ops.difference_update(ops) def add_16bit_quant_ops(self, ops: Set[OpOverload]) -> None: for op in ops: diff --git 
a/backends/qualcomm/quantizer/utils.py b/backends/qualcomm/quantizer/utils.py index d31b4753a3d..d3ae1194acd 100644 --- a/backends/qualcomm/quantizer/utils.py +++ b/backends/qualcomm/quantizer/utils.py @@ -4,6 +4,7 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. import numbers +import operator from dataclasses import dataclass from functools import partial from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple @@ -77,7 +78,7 @@ def _derive_bias_qparams_fn( def get_default_8bit_qnn_ptq_config( - act_symmetric: bool = False, act_observer=MinMaxObserver + act_symmetric: bool = False, act_observer=MovingAverageMinMaxObserver ) -> QuantizationConfig: extra_args: Dict[str, Any] = {"eps": 2**-12} @@ -96,7 +97,7 @@ def get_default_8bit_qnn_ptq_config( quant_max=torch.iinfo(torch.int8).max, qscheme=torch.per_tensor_symmetric, ch_axis=0, - observer_or_fake_quant_ctr=act_observer.with_args(**extra_args), + observer_or_fake_quant_ctr=MinMaxObserver.with_args(**extra_args), ) bias_quantization_spec = QuantizationSpec( @@ -104,7 +105,7 @@ def get_default_8bit_qnn_ptq_config( quant_min=torch.iinfo(torch.int32).min, quant_max=torch.iinfo(torch.int32).max, qscheme=torch.per_tensor_symmetric, - observer_or_fake_quant_ctr=act_observer.with_args(**extra_args), + observer_or_fake_quant_ctr=MinMaxObserver.with_args(**extra_args), ) quantization_config = QuantizationConfig( @@ -619,7 +620,13 @@ def annotate_upsample_nearest2d( annotate_single_in_single_out(node, quantization_config) -@register_annotator([torch.ops.aten.softmax.int, torch.ops.aten._softmax.default]) +@register_annotator( + [ + torch.ops.aten.softmax.int, + torch.ops.aten._softmax.default, + torch.ops.aten._safe_softmax.default, + ] +) def annotate_softmax(node: Node, quantization_config: QuantizationConfig) -> None: annotate_single_in_single_out(node, quantization_config) @@ -684,6 +691,31 @@ def annotate_squeeze(node: Node, quantization_config: QuantizationConfig) -> Non annotate_single_in_single_out(node, quantization_config) +@register_annotator([torch.ops.aten.rms_norm.default]) +def annotate_rms_norm(node: Node, quantization_config: QuantizationConfig) -> None: + act_node = node.args[0] + weight_node = node.args[2] + + if _is_annotated([node]): + return + + # TODO current only support 16a16w + _annotate_input_qspec_map( + node, + act_node, + quantization_config.input_activation, + ) + + _annotate_input_qspec_map( + node, + weight_node, + quantization_config.input_activation, + ) + nodes_to_mark_annotated = [node] + _annotate_output_qspec(node, quantization_config.output_activation) + _mark_nodes_as_annotated(nodes_to_mark_annotated) + + @register_annotator([torch.ops.aten.rsqrt.default]) def annotate_rsqrt(node: Node, quantization_config: QuantizationConfig) -> None: annotate_single_in_single_out(node, quantization_config) @@ -975,6 +1007,38 @@ def annotate_linear(node: Node, quantization_config: QuantizationConfig) -> None node.meta["source_fn_stack"] = [(node, torch.nn.Linear)] +@register_annotator([torch.ops.aten._native_batch_norm_legit_no_training.default]) +def annotate_batch_norm(node: Node, quantization_config: QuantizationConfig) -> None: + act, weight, bias = node.args[0:3] + if _is_annotated([node]): + return + + _annotate_input_qspec_map( + node, + act, + quantization_config.input_activation, + ) + # QNN requires uint8 instead of int8 in 'weight' config + _annotate_input_qspec_map( + node, + weight, + 
quantization_config.input_activation, + ) + _annotate_input_qspec_map( + node, + bias, + quantization_config.bias, + ) + _annotate_output_qspec(node, quantization_config.output_activation) + _mark_nodes_as_annotated([node, *node.args[0:3]]) + + +@register_annotator([operator.getitem]) +def annotate_getitem(node: Node, quantization_config: QuantizationConfig) -> None: + _annotate_output_qspec(node, quantization_config.output_activation) + _mark_nodes_as_annotated([node]) + + @register_annotator([torch.ops.aten.layer_norm.default]) def annotate_layer_norm(node: Node, quantization_config: QuantizationConfig) -> None: act_node = node.args[0] diff --git a/backends/qualcomm/runtime/QnnExecuTorch.h b/backends/qualcomm/runtime/QnnExecuTorch.h index 45525726ca7..dabd4cdde5f 100644 --- a/backends/qualcomm/runtime/QnnExecuTorch.h +++ b/backends/qualcomm/runtime/QnnExecuTorch.h @@ -44,7 +44,7 @@ struct CustomMemTensorInfo { size_t tensor_bytes; uint32_t* shape; uint32_t rank; - torch::executor::ScalarType dtype; + exec_aten::ScalarType dtype; }; /// Allocate specific tensors (usually graph inputs and outputs) on shared diff --git a/backends/qualcomm/runtime/QnnExecuTorchBackend.cpp b/backends/qualcomm/runtime/QnnExecuTorchBackend.cpp index 36512c4ff21..f5c9473411e 100644 --- a/backends/qualcomm/runtime/QnnExecuTorchBackend.cpp +++ b/backends/qualcomm/runtime/QnnExecuTorchBackend.cpp @@ -213,8 +213,10 @@ Error QnnExecuTorchBackend::execute( } ET_CHECK_OR_RETURN_ERROR( - qnn_manager->Execute(input_tensor_structs, output_tensor_structs) == - Error::Ok, + qnn_manager->Execute( + input_tensor_structs, + output_tensor_structs, + context.event_tracer()) == Error::Ok, Internal, "Fail to execute graph"); ET_CHECK_OR_RETURN_ERROR( diff --git a/backends/qualcomm/runtime/QnnExecuTorchBackend.h b/backends/qualcomm/runtime/QnnExecuTorchBackend.h index ed4d35068dc..fbcc7058894 100644 --- a/backends/qualcomm/runtime/QnnExecuTorchBackend.h +++ b/backends/qualcomm/runtime/QnnExecuTorchBackend.h @@ -14,7 +14,8 @@ namespace torch { namespace executor { -class QnnExecuTorchBackend final : public PyTorchBackendInterface { +class QnnExecuTorchBackend final + : public ::executorch::runtime::BackendInterface { public: ~QnnExecuTorchBackend(){}; diff --git a/backends/qualcomm/runtime/QnnManager.cpp b/backends/qualcomm/runtime/QnnManager.cpp index 3027c184d95..f4275f0ab3d 100644 --- a/backends/qualcomm/runtime/QnnManager.cpp +++ b/backends/qualcomm/runtime/QnnManager.cpp @@ -8,7 +8,9 @@ #include #include #include +#include #include +#include #include #include #include @@ -56,9 +58,7 @@ QnnManager::QnnManager( "backend_type: %s", EnumNameQnnExecuTorchBackendType(backend_type)); QNN_EXECUTORCH_LOG_INFO("graph_name: %s", options_->graph_name()->c_str()); QNN_EXECUTORCH_LOG_INFO("library_path: %s", library_path.c_str()); - QNN_EXECUTORCH_LOG_INFO( - "tensor_dump_output_path: %s", - options_->tensor_dump_output_path()->c_str()); + QNN_EXECUTORCH_LOG_INFO("dump intermediate outputs: %s", IsTensorDump()); QNN_EXECUTORCH_LOG_INFO( "log_level: %s", EnumNameQnnExecuTorchLogLevel(options_->log_level())); QNN_EXECUTORCH_LOG_INFO( @@ -281,6 +281,8 @@ Error QnnManager::Init() { options_->backend_options()->backend_type()); backend_params_ptr_ = QnnBackendFactory().Create( qnn_loaded_backend_, logger_.get(), qnn_context_blob_, options_); + ET_CHECK_OR_RETURN_ERROR( + backend_params_ptr_ != nullptr, Internal, "Failed to load Qnn backend.") ET_CHECK_OR_RETURN_ERROR( backend_params_ptr_->qnn_backend_ptr_->Configure() == Error::Ok, Internal, @@ 
-363,7 +365,8 @@ Error QnnManager::AllocateTensor( Error QnnManager::Execute( const std::vector& input_tensor_structs, - std::vector& output_tensor_structs) { + std::vector& output_tensor_structs, + EventTracer* event_tracer) { Qnn_ErrorHandle_t error = QNN_SUCCESS; error = backend_params_ptr_->qnn_graph_ptr_->GraphExecute( @@ -374,30 +377,27 @@ Error QnnManager::Execute( "qnn_graph_execute failed. Error %d", QNN_GET_ERROR_CODE(error)); return Error::Internal; } - if (IsTensorDump()) { // TODO: Need to handle the graph which is partitioned. // Maybe we could use graph name. - std::string dir = options_->tensor_dump_output_path()->str() + "/Result/"; - CreateDirectory(dir); - QNN_EXECUTORCH_LOG_INFO("Dump tensor to the path: %s", dir.c_str()); for (std::size_t out_idx = 0; out_idx < output_tensor_structs.size(); ++out_idx) { const Qnn_Tensor_t& output_tensor = output_tensor_structs[out_idx]; - - std::string output_path = - dir + QNN_VER_PTR(output_tensor)->name + "_tensor.raw"; - - std::ofstream fout(output_path, std::ios::binary); - if (fout.fail()) { - QNN_EXECUTORCH_LOG_ERROR( - "Dump tensor name: %s Failed.", QNN_VER_PTR(output_tensor)->name); - return Error::Internal; - } - - fout.write( - static_cast(QNN_VER_PTR(output_tensor)->clientBuf.data), - QNN_VER_PTR(output_tensor)->clientBuf.dataSize); + std::vector sizes( + QNN_VER_PTR(output_tensor)->dimensions, + QNN_VER_PTR(output_tensor)->dimensions + + QNN_VER_PTR(output_tensor)->rank); + + auto dump_tensor = executorch::extension::from_blob( + QNN_VER_PTR(output_tensor)->clientBuf.data, + sizes, + qnn_dtype_to_scalar_type_[QNN_VER_PTR(output_tensor)->dataType]); + + torch::executor::event_tracer_log_output_delegate( + event_tracer, + QNN_VER_PTR(output_tensor)->name, + /*delegate_debug_id=*/static_cast(-1), + *dump_tensor); } } diff --git a/backends/qualcomm/runtime/QnnManager.h b/backends/qualcomm/runtime/QnnManager.h index 5190f6768b7..3d1cc3863aa 100644 --- a/backends/qualcomm/runtime/QnnManager.h +++ b/backends/qualcomm/runtime/QnnManager.h @@ -37,7 +37,8 @@ class QnnManager { Error Execute( const std::vector& input_tensor_structs, - std::vector& output_tensor_structs); + std::vector& output_tensor_structs, + EventTracer* event_tracer); Error ProfileExecuteData(EventTracer* event_tracer); @@ -52,7 +53,7 @@ class QnnManager { } bool IsTensorDump() { - return options_->tensor_dump_output_path()->size() > 0; + return options_->dump_intermediate_outputs(); } bool IsNodeSupportedByBackend( diff --git a/backends/qualcomm/runtime/SharedBuffer.cpp b/backends/qualcomm/runtime/SharedBuffer.cpp index 3fa62d09cdb..2b2a729835c 100644 --- a/backends/qualcomm/runtime/SharedBuffer.cpp +++ b/backends/qualcomm/runtime/SharedBuffer.cpp @@ -25,7 +25,7 @@ std::size_t std::hash::operator()( hash_val ^= info.shape[i]; } hash_val ^= std::hash()(info.rank); - hash_val ^= std::hash()(info.dtype); + hash_val ^= std::hash()(info.dtype); return hash_val; } diff --git a/backends/qualcomm/runtime/backends/QnnBackendCommon.cpp b/backends/qualcomm/runtime/backends/QnnBackendCommon.cpp index 3e286c07b02..c67f9b52f5d 100644 --- a/backends/qualcomm/runtime/backends/QnnBackendCommon.cpp +++ b/backends/qualcomm/runtime/backends/QnnBackendCommon.cpp @@ -53,6 +53,85 @@ Error QnnBackend::Configure() { } return Error::Ok; } + +Error QnnBackend::VerifyQNNSDKVersion( + const QnnExecuTorchBackendType backend_id) { + const QnnInterface& qnn_interface = implementation_.GetQnnInterface(); + + Qnn_ApiVersion_t qnn_version = {QNN_VERSION_INIT}; + Qnn_ErrorHandle_t error = + 
qnn_interface.qnn_backend_get_api_version(&qnn_version); + if (error != QNN_SUCCESS) { + QNN_EXECUTORCH_LOG_ERROR("Failed to get Qnn API version."); + return Error::Internal; + } + + Qnn_ApiVersion_t expected_version = {QNN_VERSION_INIT}; + expected_version.coreApiVersion.major = QNN_API_VERSION_MAJOR; + expected_version.coreApiVersion.minor = QNN_API_VERSION_MINOR; + expected_version.coreApiVersion.patch = QNN_API_VERSION_PATCH; + expected_version.backendApiVersion = GetExpectedBackendVersion(); + const char* backend_type = EnumNameQnnExecuTorchBackendType(backend_id); + + Error status = VersionChecker( + qnn_version.coreApiVersion, expected_version.coreApiVersion, "Qnn API"); + if (status == Error::Ok) { + status = VersionChecker( + qnn_version.backendApiVersion, + expected_version.backendApiVersion, + backend_type); + } + + return status; +} + +Error QnnBackend::VersionChecker( + const Qnn_Version_t& qnn_version, + const Qnn_Version_t& expected, + const std::string& prefix) { + if (qnn_version.major != expected.major) { + QNN_EXECUTORCH_LOG_ERROR( + "%s version %u.%u.%u is not supported. " + "The minimum supported version is %u.%u.%u. Please make " + "sure you have the correct backend library version.", + prefix.c_str(), + qnn_version.major, + qnn_version.minor, + qnn_version.patch, + expected.major, + expected.minor, + expected.patch); + return Error::Internal; + } + if (qnn_version.major == QNN_API_VERSION_MAJOR && + qnn_version.minor < expected.minor) { + QNN_EXECUTORCH_LOG_WARN( + "%s version %u.%u.%u is mismatched. " + "The minimum supported version is %u.%u.%u. Please make " + "sure you have the correct backend library version.", + prefix.c_str(), + qnn_version.major, + qnn_version.minor, + qnn_version.patch, + expected.major, + expected.minor, + expected.patch); + } + if ((qnn_version.major == QNN_API_VERSION_MAJOR && + qnn_version.minor > expected.minor)) { + QNN_EXECUTORCH_LOG_WARN( + "%s version %u.%u.%u is used. 
" + "The version is tested against %u.%u.%u.", + prefix.c_str(), + qnn_version.major, + qnn_version.minor, + qnn_version.patch, + expected.major, + expected.minor, + expected.patch); + } + return Error::Ok; +} } // namespace qnn } // namespace executor } // namespace torch diff --git a/backends/qualcomm/runtime/backends/QnnBackendCommon.h b/backends/qualcomm/runtime/backends/QnnBackendCommon.h index e6ea0adff8b..de007898e5d 100644 --- a/backends/qualcomm/runtime/backends/QnnBackendCommon.h +++ b/backends/qualcomm/runtime/backends/QnnBackendCommon.h @@ -13,8 +13,10 @@ #include +#include "HTP/QnnHtpCommon.h" #include "QnnBackend.h" #include "QnnCommon.h" +#include "QnnTypes.h" namespace torch { namespace executor { namespace qnn { @@ -43,7 +45,10 @@ class QnnBackend { return handle_; } + Error VerifyQNNSDKVersion(const QnnExecuTorchBackendType backend_id); + protected: + virtual Qnn_Version_t GetExpectedBackendVersion() const = 0; virtual Error MakeConfig(std::vector& config) { return Error::Ok; }; @@ -52,6 +57,10 @@ class QnnBackend { Qnn_BackendHandle_t handle_; const QnnImplementation& implementation_; QnnLogger* logger_; + Error VersionChecker( + const Qnn_Version_t& qnn_version, + const Qnn_Version_t& expected, + const std::string& prefix); }; } // namespace qnn } // namespace executor diff --git a/backends/qualcomm/runtime/backends/QnnBackendFactory.cpp b/backends/qualcomm/runtime/backends/QnnBackendFactory.cpp index acb95524682..9fb292613a3 100644 --- a/backends/qualcomm/runtime/backends/QnnBackendFactory.cpp +++ b/backends/qualcomm/runtime/backends/QnnBackendFactory.cpp @@ -16,6 +16,7 @@ std::unique_ptr QnnBackendFactory::Create( const QnnExecuTorchContextBinary& qnn_context_blob, const QnnExecuTorchOptions* options) { auto backend_params = std::make_unique(); + switch (options->backend_options()->backend_type()) { case QnnExecuTorchBackendType::kHtpBackend: { auto htp_options = options->backend_options()->htp_options(); @@ -51,6 +52,7 @@ std::unique_ptr QnnBackendFactory::Create( } backend_params->qnn_backend_ptr_ = std::make_unique(implementation, logger); + backend_params->qnn_device_ptr_ = std::make_unique( implementation, logger, options->soc_info(), htp_options); @@ -72,7 +74,6 @@ std::unique_ptr QnnBackendFactory::Create( backend_params->qnn_mem_manager_ptr_ = std::make_unique( implementation, backend_params->qnn_context_ptr_.get()); backend_params->backend_init_state_ = BackendInitializeState::INITIALIZED; - return backend_params; } break; case QnnExecuTorchBackendType::kGpuBackend: case QnnExecuTorchBackendType::kDspBackend: @@ -81,7 +82,11 @@ std::unique_ptr QnnBackendFactory::Create( return nullptr; } - // should not reach here + if (backend_params->qnn_backend_ptr_->VerifyQNNSDKVersion( + options->backend_options()->backend_type()) == Error::Ok) { + return backend_params; + } + return nullptr; } } // namespace qnn diff --git a/backends/qualcomm/runtime/backends/QnnProfiler.cpp b/backends/qualcomm/runtime/backends/QnnProfiler.cpp index fa5829d23b8..ae336a800b6 100644 --- a/backends/qualcomm/runtime/backends/QnnProfiler.cpp +++ b/backends/qualcomm/runtime/backends/QnnProfiler.cpp @@ -7,7 +7,6 @@ */ #include -#include namespace torch { namespace executor { diff --git a/backends/qualcomm/runtime/backends/htpbackend/HtpBackend.h b/backends/qualcomm/runtime/backends/htpbackend/HtpBackend.h index d4b14178a43..d00bd50cdc3 100644 --- a/backends/qualcomm/runtime/backends/htpbackend/HtpBackend.h +++ b/backends/qualcomm/runtime/backends/htpbackend/HtpBackend.h @@ -8,7 +8,9 @@ #pragma 
once #include +#include "HTP/QnnHtpCommon.h" #include "HTP/QnnHtpProfile.h" +#include "QnnTypes.h" namespace torch { namespace executor { namespace qnn { @@ -24,6 +26,14 @@ class HtpBackend : public QnnBackend { event_type == QNN_HTP_PROFILE_EVENTTYPE_GRAPH_EXECUTE_ACCEL_TIME_CYCLE); } + Qnn_Version_t GetExpectedBackendVersion() const override { + Qnn_Version_t backend_version; + backend_version.major = QNN_HTP_API_VERSION_MAJOR; + backend_version.minor = QNN_HTP_API_VERSION_MINOR; + backend_version.patch = QNN_HTP_API_VERSION_PATCH; + return backend_version; + } + protected: Error MakeConfig(std::vector& config) override { return Error::Ok; diff --git a/backends/qualcomm/runtime/targets.bzl b/backends/qualcomm/runtime/targets.bzl index 77449e95e2a..61650fab268 100644 --- a/backends/qualcomm/runtime/targets.bzl +++ b/backends/qualcomm/runtime/targets.bzl @@ -63,5 +63,6 @@ def define_common_targets(): "//executorch/backends/qualcomm/aot/wrappers:wrappers", "//executorch/runtime/backend:interface", "//executorch/runtime/core:core", + "//executorch/extension/tensor:tensor", ], ) diff --git a/backends/qualcomm/scripts/build.sh b/backends/qualcomm/scripts/build.sh index aafd6252e79..4cb2f50bbd2 100755 --- a/backends/qualcomm/scripts/build.sh +++ b/backends/qualcomm/scripts/build.sh @@ -4,6 +4,7 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. set -e +set -o xtrace if [[ -z ${QNN_SDK_ROOT} ]]; then echo "Please export QNN_SDK_ROOT=/path/to/qnn_sdk" @@ -70,7 +71,7 @@ if [ "$BUILD_AARCH64" = true ]; then rm -rf $BUILD_ROOT && mkdir $BUILD_ROOT else # Force rebuild flatccrt for the correct platform - cd $BUILD_ROOT/sdk && make clean + cd $BUILD_ROOT/devtools && make clean fi cd $BUILD_ROOT @@ -78,8 +79,9 @@ if [ "$BUILD_AARCH64" = true ]; then -DCMAKE_INSTALL_PREFIX=$BUILD_ROOT \ -DCMAKE_BUILD_TYPE=$BUILD_TYPE \ -DEXECUTORCH_BUILD_QNN=ON \ - -DEXECUTORCH_BUILD_SDK=ON \ + -DEXECUTORCH_BUILD_DEVTOOLS=ON \ -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ -DEXECUTORCH_ENABLE_EVENT_TRACER=ON \ -DQNN_SDK_ROOT=$QNN_SDK_ROOT \ -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_ROOT/build/cmake/android.toolchain.cmake \ @@ -112,7 +114,7 @@ if [ "$BUILD_X86_64" = true ]; then rm -rf $BUILD_ROOT && mkdir $BUILD_ROOT else # Force rebuild flatccrt for the correct platform - cd $BUILD_ROOT/sdk && make clean + cd $BUILD_ROOT/devtools && make clean fi cd $BUILD_ROOT @@ -121,8 +123,9 @@ if [ "$BUILD_X86_64" = true ]; then -DCMAKE_INSTALL_PREFIX=$BUILD_ROOT \ -DQNN_SDK_ROOT=${QNN_SDK_ROOT} \ -DEXECUTORCH_BUILD_QNN=ON \ - -DEXECUTORCH_BUILD_SDK=ON \ + -DEXECUTORCH_BUILD_DEVTOOLS=ON \ -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ -DEXECUTORCH_ENABLE_EVENT_TRACER=ON \ -DPYTHON_EXECUTABLE=$PYTHON_EXECUTABLE \ -S $PRJ_ROOT \ diff --git a/backends/qualcomm/serialization/qnn_compile_spec_schema.py b/backends/qualcomm/serialization/qnn_compile_spec_schema.py index 338f61997ea..8471aad982d 100644 --- a/backends/qualcomm/serialization/qnn_compile_spec_schema.py +++ b/backends/qualcomm/serialization/qnn_compile_spec_schema.py @@ -129,7 +129,7 @@ class QnnExecuTorchOptions: library_path: str = "" log_level: QnnExecuTorchLogLevel = QnnExecuTorchLogLevel.kLogOff online_prepare: bool = False - tensor_dump_output_path: str = "" + dump_intermediate_outputs: bool = False profile_level: QnnExecuTorchProfileLevel = QnnExecuTorchProfileLevel.kProfileOff shared_buffer: bool = False 
is_from_context_binary: bool = False diff --git a/backends/qualcomm/serialization/schema.fbs b/backends/qualcomm/serialization/schema.fbs index 4288c83b130..4e7fdb56e89 100644 --- a/backends/qualcomm/serialization/schema.fbs +++ b/backends/qualcomm/serialization/schema.fbs @@ -164,11 +164,9 @@ table QnnExecuTorchOptions { /// Check if on-device graph construction. Default is false. online_prepare:bool; - /// Tensor dump output path. If a path is given, Delegate would write - /// outputs of each OP there. - /// In ALL cases, we don't recommend to set this option. - /// This option exist just for debugging some accuracy issues. - tensor_dump_output_path:string; + /// If tensor dump is enabled, all intermediate tensors output will be dumped. + /// This option exists for debugging accuracy issues. Default is off. + dump_intermediate_outputs:bool; /// Profiling level of the delegate and the backend. Default is off. profile_level:QnnExecuTorchProfileLevel; diff --git a/backends/qualcomm/tests/models.py b/backends/qualcomm/tests/models.py index 319cc6092cd..e448a219284 100644 --- a/backends/qualcomm/tests/models.py +++ b/backends/qualcomm/tests/models.py @@ -55,6 +55,16 @@ def forward(self, x): return self.avgPool(x) +class BatchNorm(torch.nn.Module): + def __init__(self, n_features): + super().__init__() + self.native_batchnorm = torch.nn.BatchNorm2d(n_features) + self.eval() + + def forward(self, x): + return self.native_batchnorm(x) + + class Bmm(torch.nn.Module): def __init__(self): super().__init__() @@ -734,6 +744,16 @@ def forward(self, x): ) +class RmsNorm(torch.nn.Module): + def __init__(self): + super().__init__() + self.eps = 1e-5 + self.rms = torch.nn.RMSNorm([4], 1e-5) + + def forward(self, x): + return self.rms(x) + + class Rsqrt(torch.nn.Module): def __init__(self): super().__init__() diff --git a/backends/qualcomm/tests/test_qnn_delegate.py b/backends/qualcomm/tests/test_qnn_delegate.py index dd704c35c08..d022ac96c48 100644 --- a/backends/qualcomm/tests/test_qnn_delegate.py +++ b/backends/qualcomm/tests/test_qnn_delegate.py @@ -16,6 +16,7 @@ from executorch.backends.qualcomm.tests.utils import ( generate_context_binary, QnnPartitioner, + QnnQuantizer, QuantDtype, TestQNN, to_backend, @@ -33,6 +34,7 @@ from_context_binary, generate_htp_compiler_spec, generate_qnn_executorch_compiler_spec, + skip_annotation, ) from executorch.examples.qualcomm.utils import setup_common_args_and_variables @@ -50,8 +52,8 @@ from executorch.examples.models.mobilenet_v3 import MV3Model from executorch.examples.models.torchvision_vit.model import TorchVisionViTModel from executorch.examples.models.wav2letter import Wav2LetterModel +from executorch.exir import to_edge from executorch.exir.backend.backend_api import disable_validation -from executorch.exir.program._program import EdgeCompileConfig, ExirExportedProgram class TestQNNFloatingPointOperator(TestQNN): @@ -66,7 +68,7 @@ def setUp(self): debug=False, saver=False, online_prepare=TestQNN.online_prepare, - tensor_dump_output_path="", + dump_intermediate_outputs=TestQNN.dump_intermediate_outputs, profile=TestQNN.enable_profile, shared_buffer=TestQNN.shared_buffer, ) @@ -81,6 +83,11 @@ def test_qnn_backend_avg_pool2d(self): sample_input = (torch.randn(1, 3, 2, 2),) self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_batch_norm(self): + module = BatchNorm(32) # noqa: F405 + sample_input = (torch.randn([4, 32, 16, 16]),) + self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_bmm(self): module = Bmm() # 
noqa: F405 torch.manual_seed(8) @@ -291,7 +298,6 @@ def test_qnn_backend_layer_norm(self): sample_input = (torch.randn(196, 768),) self.lower_module_and_test_output(module, sample_input) - @unittest.skip("only works on QNN 2.17") def test_qnn_backend_leaky_relu(self): test_comb = [ { @@ -334,14 +340,12 @@ def test_qnn_backend_mean_dim(self): with self.subTest(i=i): self.lower_module_and_test_output(module, sample_input) - @unittest.skip("it will hang in runtime") + @unittest.skip("failed to lower in QNN 2.26") def test_qnn_backend_mha(self): module = MultiheadAttention() # noqa: F405 sample_input = (torch.randn(1, 197, 96),) self.lower_module_and_test_output(module, sample_input) - # fp16 pad op might hit corner case in runtime - @unittest.expectedFailure def test_qnn_backend_pad(self): module = Pad() # noqa: F405 sample_input = (torch.randn([1, 8, 128]),) @@ -362,7 +366,6 @@ def test_qnn_backend_pow_tensor_scalar(self): sample_input = (torch.rand([2, 4, 3, 3]),) self.lower_module_and_test_output(module, sample_input) - @unittest.skip("only works on QNN 2.17") def test_qnn_backend_prelu(self): test_comb = [ { @@ -393,6 +396,11 @@ def test_qnn_backend_reshape(self): sample_input = (torch.randn([3, 4]),) self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_rms_norm(self): + module = RmsNorm() # noqa: F405 + sample_input = (torch.abs(torch.randn([1, 1, 1, 4])),) + self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_rsqrt(self): module = Rsqrt() # noqa: F405 sample_input = (torch.abs(torch.randn([3, 4])),) @@ -482,7 +490,7 @@ def setUp(self): debug=False, saver=False, online_prepare=TestQNN.online_prepare, - tensor_dump_output_path="", + dump_intermediate_outputs=TestQNN.dump_intermediate_outputs, profile=TestQNN.enable_profile, shared_buffer=TestQNN.shared_buffer, ) @@ -596,7 +604,7 @@ def setUp(self): debug=False, saver=False, online_prepare=TestQNN.online_prepare, - tensor_dump_output_path="", + dump_intermediate_outputs=TestQNN.dump_intermediate_outputs, profile=TestQNN.enable_profile, shared_buffer=TestQNN.shared_buffer, ) @@ -621,6 +629,7 @@ def test_qnn_backend_16a4w_linear(self): ) self.lower_module_and_test_output(module, sample_input) + @unittest.skip("segfault happens in QNN 2.26") def test_qnn_backend_16a4w_per_channel_linear(self): module = Linear(use_bias=False) # noqa: F405 sample_input = (torch.randn([3, 4]),) @@ -655,6 +664,12 @@ def test_qnn_backend_avg_pool2d(self): module = self.get_qdq_module(module, sample_input) self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_batch_norm(self): + module = BatchNorm(32) # noqa: F405 + sample_input = (torch.randn([4, 32, 16, 16]),) + module = self.get_qdq_module(module, sample_input) + self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_bmm(self): module = Bmm() # noqa: F405 torch.manual_seed(8) @@ -662,13 +677,6 @@ def test_qnn_backend_bmm(self): module = self.get_qdq_module(module, sample_input) self.lower_module_and_test_output(module, sample_input) - @unittest.skip("not applicable") - def test_qnn_backend_cast(self): - module = Cast() # noqa: F405 - sample_input = (10 * torch.rand((9, 4, 5, 3)),) - module = self.get_qdq_module(module, sample_input) - self.lower_module_and_test_output(module, sample_input) - def test_qnn_backend_cat(self): modules = [Cat2(), Cat3(), Cat4()] # noqa: F405 sample_input = (torch.randn(1, 1, 2, 2), torch.randn(1, 1, 4, 2)) @@ -1000,6 +1008,14 @@ def test_qnn_backend_reshape(self): module = 
self.get_qdq_module(module, sample_input) self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_rms_norm(self): + module = RmsNorm() # noqa: F405 + sample_input = (torch.abs(torch.randn([1, 1, 1, 4])),) + module = self.get_qdq_module( + module, sample_input, quant_dtype=QuantDtype.use_16a4w + ) + self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_rsqrt(self): module = Rsqrt() # noqa: F405 sample_input = (torch.abs(torch.randn([3, 4])),) @@ -1105,7 +1121,7 @@ def setUp(self): debug=False, saver=False, online_prepare=TestQNN.online_prepare, - tensor_dump_output_path="", + dump_intermediate_outputs=TestQNN.dump_intermediate_outputs, profile=TestQNN.enable_profile, shared_buffer=TestQNN.shared_buffer, ) @@ -1271,6 +1287,22 @@ def setUp(self): saver=False, ) + def test_qnn_backend_dump_intermediate_outputs(self): + backend_options = generate_htp_compiler_spec(use_fp16=True) + TestQNN.compiler_specs = generate_qnn_executorch_compiler_spec( + soc_model=self.arch_table[TestQNN.model], + backend_options=backend_options, + dump_intermediate_outputs=True, + ) + module = Relu() # noqa: F405 + sample_input = (torch.randn([2, 5, 1, 3]),) + self.lower_module_and_test_output( + module, + sample_input, + expected_partitions=1, + expected_intermediate_events=3, + ) + def test_qnn_backend_skip_node_id(self): module = SimpleModel() # noqa: F405 sample_input = (torch.ones(1, 32, 28, 28), torch.ones(1, 32, 28, 28)) @@ -1329,16 +1361,10 @@ def test_qnn_backend_multi_contexts_composite(self): lowered_method=to_backend, ) sample_input = module.get_random_input() - edge_prog = ExirExportedProgram( + edge_prog = to_edge( torch.export.export(module, sample_input), - after_to_edge_passes=False, - ).to_edge( - EdgeCompileConfig( - _check_ir_validity=False, - _skip_dim_order=True, # TODO(T182928844): Delegate dim order op to backend. 
- ) ) - canonicalize_program(edge_prog.exported_program) + canonicalize_program(edge_prog.exported_program()) exec_prog = edge_prog.to_executorch() self.verify_output(module.get_reference_module(), sample_input, exec_prog) @@ -1388,6 +1414,7 @@ def test_qnn_backend_online_prepare(self): sample_input = (torch.ones(1, 32, 28, 28), torch.ones(1, 32, 28, 28)) self.lower_module_and_test_output(module, sample_input) + @unittest.skip("segfault happens in recent torch.export.export") def test_qnn_backend_context_direct(self): with tempfile.TemporaryDirectory() as tmp_dir: module = ContextBinaryExample() # noqa: F405 @@ -1431,7 +1458,24 @@ def setUp(self): saver=False, ) - def test_qnn_backend_skip_node_id(self): + def test_qnn_backend_dump_intermediate_outputs(self): + backend_options = generate_htp_compiler_spec(use_fp16=False) + TestQNN.compiler_specs = generate_qnn_executorch_compiler_spec( + soc_model=self.arch_table[TestQNN.model], + backend_options=backend_options, + dump_intermediate_outputs=True, + ) + module = Relu() # noqa: F405 + sample_input = (torch.randn([2, 5, 1, 3]),) + module = self.get_qdq_module(module, sample_input) + self.lower_module_and_test_output( + module, + sample_input, + expected_partitions=1, + expected_intermediate_events=5, + ) + + def test_qnn_backend_skip_node_id_partitioner(self): module = SimpleModel() # noqa: F405 sample_input = (torch.ones(1, 32, 28, 28), torch.ones(1, 32, 28, 28)) module = self.get_qdq_module(module, sample_input) @@ -1442,7 +1486,43 @@ def test_qnn_backend_skip_node_id(self): skip_node_id_set={"aten_add_tensor", "aten_mean_dim"}, ) - def test_qnn_backend_skip_node_op(self): + def test_qnn_backend_skip_node_id_quantizer(self): + module = SimpleModel() # noqa: F405 + sample_input = (torch.ones(1, 32, 28, 28), torch.ones(1, 32, 28, 28)) + + # define partitioner + backend_options = generate_htp_compiler_spec( + use_fp16=False, + ) + compiler_specs = generate_qnn_executorch_compiler_spec( + soc_model=self.arch_table[TestQNN.model], + backend_options=backend_options, + ) + partitioner = QnnPartitioner(compiler_specs) + # define quantizer + quantizer = QnnQuantizer() + + # define calibration method + def calibrator(gm): + gm(*sample_input) + + # get partially lowererd graph module + graph_module, exported_progs = skip_annotation( + nn_module=module, + quantizer=quantizer, + partitioner=partitioner, + sample_input=sample_input, + calibration_cb=calibrator, + fp_node_id_set={"conv2d"}, + ) + self.assertEqual(len(exported_progs), 1) + # lower all graph again, the skipped operators will be left in CPU + exec_prog = to_edge( + torch.export.export(graph_module, sample_input), + ).to_executorch() + self.verify_output(module, sample_input, exec_prog) + + def test_qnn_backend_skip_node_op_partitioner(self): module = SimpleModel() # noqa: F405 sample_input = (torch.ones(1, 32, 28, 28), torch.ones(1, 32, 28, 28)) module = self.get_qdq_module(module, sample_input) @@ -1453,6 +1533,79 @@ def test_qnn_backend_skip_node_op(self): skip_node_op_set={"aten.add.Tensor"}, ) + def test_qnn_backend_skip_node_op_quantizer(self): + module = SimpleModel() # noqa: F405 + sample_input = (torch.ones(1, 32, 28, 28), torch.ones(1, 32, 28, 28)) + + # define partitioner + backend_options = generate_htp_compiler_spec( + use_fp16=False, + ) + compiler_specs = generate_qnn_executorch_compiler_spec( + soc_model=self.arch_table[TestQNN.model], + backend_options=backend_options, + ) + partitioner = QnnPartitioner(compiler_specs) + # define quantizer + quantizer = QnnQuantizer() + + # 
define calibration method + def calibrator(gm): + gm(*sample_input) + + # get partially lowererd graph module + graph_module, exported_progs = skip_annotation( + nn_module=module, + quantizer=quantizer, + partitioner=partitioner, + sample_input=sample_input, + calibration_cb=calibrator, + fp_node_op_set={torch.ops.aten.add.Tensor}, + ) + self.assertEqual(len(exported_progs), 2) + # lower all graph again, the skipped operators will be left in CPU + exec_prog = exec_prog = to_edge( + torch.export.export(graph_module, sample_input), + ).to_executorch() + self.verify_output(module, sample_input, exec_prog) + + def test_qnn_backend_graph_level_mixed_precision(self): + module = SimpleModel() # noqa: F405 + sample_input = (torch.ones(1, 32, 28, 28), torch.ones(1, 32, 28, 28)) + + # define partitioner + backend_options = generate_htp_compiler_spec( + use_fp16=False, + ) + compiler_specs = generate_qnn_executorch_compiler_spec( + soc_model=self.arch_table[TestQNN.model], + backend_options=backend_options, + ) + partitioner = QnnPartitioner(compiler_specs) + # define quantizer + quantizer = QnnQuantizer() + + # define calibration method + def calibrator(gm): + gm(*sample_input) + + # get partially lowererd graph module + graph_module, exported_progs = skip_annotation( + nn_module=module, + quantizer=quantizer, + partitioner=partitioner, + sample_input=sample_input, + calibration_cb=calibrator, + fp_node_id_set={"add", "mean"}, + fallback_to_cpu=False, + ) + self.assertEqual(len(exported_progs), 5) + # lower all graph again, the skipped operators will be delegated with fp16 + exec_prog = to_edge( + torch.export.export(graph_module, sample_input), + ).to_executorch() + self.verify_output(module, sample_input, exec_prog) + def test_qnn_backend_multi_contexts(self): module = SimpleModel() # noqa: F405 sample_input = (torch.ones(1, 32, 28, 28), torch.ones(1, 32, 28, 28)) @@ -1493,16 +1646,10 @@ def test_qnn_backend_multi_contexts_composite(self): quantize_method=self.get_qdq_module, ) sample_input = module.get_random_input() - edge_prog = ExirExportedProgram( + edge_prog = to_edge( torch.export.export(module, sample_input), - after_to_edge_passes=False, - ).to_edge( - EdgeCompileConfig( - _check_ir_validity=False, - _skip_dim_order=True, # TODO(T182928844): Delegate dim order op to backend. 
- ) ) - canonicalize_program(edge_prog.exported_program) + canonicalize_program(edge_prog.exported_program()) exec_prog = edge_prog.to_executorch() self.verify_output(module.get_reference_module(), sample_input, exec_prog) @@ -1555,6 +1702,7 @@ def test_qnn_backend_online_prepare(self): module = self.get_qdq_module(module, sample_input) self.lower_module_and_test_output(module, sample_input) + @unittest.skip("segfault happens in recent torch.export.export") def test_qnn_backend_context_direct(self): with tempfile.TemporaryDirectory() as tmp_dir: module = ContextBinaryExample() # noqa: F405 @@ -1668,6 +1816,46 @@ def test_gMLP(self): self.assertGreaterEqual(msg["top_1"], 60) self.assertGreaterEqual(msg["top_5"], 90) + def test_regnet(self): + if not self.required_envs([self.image_dataset]): + self.skipTest("missing required envs") + + weights = ["regnet_y_400mf", "regnet_x_400mf"] + cmds = [ + "python", + f"{self.executorch_root}/examples/qualcomm/oss_scripts/regnet.py", + "--dataset", + self.image_dataset, + "--artifact", + self.artifact_dir, + "--build_folder", + self.build_folder, + "--device", + self.device, + "--model", + self.model, + "--ip", + self.ip, + "--port", + str(self.port), + ] + if self.host: + cmds.extend(["--host", self.host]) + + for weight in weights: + p = subprocess.Popen( + cmds + ["--weights", weight], stdout=subprocess.DEVNULL + ) + with Listener((self.ip, self.port)) as listener: + conn = listener.accept() + p.communicate() + msg = json.loads(conn.recv()) + if "Error" in msg: + self.fail(msg["Error"]) + else: + self.assertGreaterEqual(msg["top_1"], 60) + self.assertGreaterEqual(msg["top_5"], 85) + def test_ssd300_vgg16(self): if not self.required_envs([self.pretrained_weight, self.oss_repo]): self.skipTest("missing required envs") @@ -1996,7 +2184,61 @@ def test_llama3_8b(self): self.fail(msg["Error"]) else: model_out = msg["result"] - self.assertTrue(model_out.startswith(prompt)) + expected_result = ( + "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n" + + prompt + + "<|eot_id|><|start_header_id|>assistant<|end_header_id|>" + ) + self.assertTrue(model_out.startswith(expected_result)) + + def test_stable_diffusion(self): + if not self.required_envs(): + self.skipTest("missing required envs") + + prompt = "a photo of an astronaut riding a horse on mars" + cmds = [ + "python", + f"{self.executorch_root}/examples/qualcomm/qaihub_scripts/stable_diffusion/qaihub_stable_diffusion.py", + "--artifact", + self.artifact_dir, + "--build_folder", + self.build_folder, + "--device", + self.device, + "--model", + self.model, + "--text_encoder_bin", + f"{self.artifact_dir}/text_encoder.serialized.bin", + "--unet_bin", + f"{self.artifact_dir}/unet.serialized.bin", + "--vae_bin", + f"{self.artifact_dir}/vae.serialized.bin", + "--vocab_json", + f"{self.artifact_dir}/vocab.json", + "--num_time_steps", + "20", + "--ip", + self.ip, + "--port", + str(self.port), + "--prompt", + f"{prompt}", + "--fix_latents", + ] + if self.host: + cmds.extend(["--host", self.host]) + + p = subprocess.Popen(cmds, stdout=subprocess.DEVNULL) + with Listener((self.ip, self.port)) as listener: + conn = listener.accept() + p.communicate() + msg = json.loads(conn.recv()) + if "Error" in msg: + self.fail(msg["Error"]) + else: + # For the default settings and prompt, the expected results will be {PSNR: 23.258, SSIM: 0.852} + self.assertGreaterEqual(msg["PSNR"], 20) + self.assertGreaterEqual(msg["SSIM"], 0.8) class TestExampleScript(TestQNN): @@ -2324,6 +2566,7 @@ def test_stories_single_llama(self): 
model_out = msg["result"][0] self.assertTrue(model_out.startswith(golden_start_with)) + @unittest.skip("dynamic shape inputs appear in recent torch.export.export") def test_mobilebert(self): if not self.required_envs([self.pretrained_weight]): self.skipTest("missing required envs") @@ -2364,13 +2607,8 @@ def test_mobilebert(self): for k, v in cpu.items(): self.assertLessEqual(abs(v[0] - htp[k][0]), 2) - @unittest.skip("will be enabled after TODOs got resolved") + @unittest.skip("eagar mode fake quant works well, need further investigation") def test_ptq_mobilebert(self): - # TODO: 2 approaches to resolve accuracy issue - # 1. fallback embedding layers: - # - skip annotation in quantizer (need PR to provide helper funciton) - # - skip operators in partitioner (use existent "skip_node_op_set") - # 2. investigate different quantization configurations / mechanisms if not self.required_envs([self.pretrained_weight]): self.skipTest("missing required envs") @@ -2387,6 +2625,8 @@ def test_ptq_mobilebert(self): self.model, "--pretrained_weight", self.pretrained_weight, + "--ptq", + "16a16w", "--ip", self.ip, "--port", @@ -2513,6 +2753,7 @@ def setup_environment(): TestQNN.oss_repo = args.oss_repo TestQNN.shared_buffer = args.shared_buffer TestQNN.enable_x86_64 = args.enable_x86_64 + TestQNN.dump_intermediate_outputs = args.dump_intermediate_outputs return sys.argv[:1] + ns_args diff --git a/backends/qualcomm/tests/utils.py b/backends/qualcomm/tests/utils.py index 5fd6d5ad196..7209b0a2678 100644 --- a/backends/qualcomm/tests/utils.py +++ b/backends/qualcomm/tests/utils.py @@ -27,6 +27,7 @@ QcomChipset, ) from executorch.backends.qualcomm.utils.utils import capture_program +from executorch.devtools import generate_etrecord, Inspector from executorch.examples.qualcomm.utils import ( generate_inputs, make_output_dir, @@ -39,9 +40,7 @@ from executorch.exir.lowered_backend_module import LoweredBackendModule from executorch.exir.pass_base import ExportPass from executorch.exir.passes.memory_planning_pass import MemoryPlanningPass -from executorch.exir.program._program import ExecutorchProgram -from executorch.sdk import generate_etrecord -from executorch.sdk.inspector import Inspector +from executorch.exir.program import ExecutorchProgram, ExecutorchProgramManager from torch.ao.quantization.quantize_pt2e import convert_pt2e, prepare_pt2e @@ -181,18 +180,21 @@ def _save_model_and_expected_output( return input_list, ref_outputs, pte_fname - def verify_output( + def verify_output( # noqa: C901 self, module: torch.nn.Module, sample_inputs: Tuple[torch.Tensor], executorch_prog: ExecutorchProgram | LoweredBackendModule, etrecord_path: str = "etrecord.bin", expected_profile_events: int = -1, + expected_intermediate_events: int = -1, ): with tempfile.TemporaryDirectory() as tmp_dir: buffer = ( executorch_prog.buffer - if isinstance(executorch_prog, ExecutorchProgram) + if isinstance( + executorch_prog, (ExecutorchProgram, ExecutorchProgramManager) + ) else executorch_prog.buffer() ) ( @@ -209,6 +211,7 @@ def verify_output( output_dir = f"{tmp_dir}/outputs" outputs = [] etdump_path = f"{tmp_dir}/etdump.etdp" + debug_output_path = f"{tmp_dir}/debug_output.bin" def post_process(): for i, f in enumerate(sorted(os.listdir(output_dir))): @@ -223,6 +226,16 @@ def validate_profile(): len(inspector.to_dataframe().index) == expected_profile_events ) + def validate_intermediate_tensor(): + inspector = Inspector( + etdump_path=etdump_path, debug_buffer_path=debug_output_path + ) + for event_block in inspector.event_blocks: + 
if event_block.name == "Execute": + self.assertTrue( + len(event_block.events) == expected_intermediate_events + ) + if self.enable_x86_64: generate_inputs(tmp_dir, "input_list.txt", [sample_inputs], input_list) make_output_dir(output_dir) @@ -275,6 +288,9 @@ def validate_profile(): # Verify the etdump if expected_profile_events != -1: validate_profile() + + if expected_intermediate_events != -1: + validate_intermediate_tensor() else: adb = SimpleADB( qnn_sdk=os.getenv("QNN_SDK_ROOT"), @@ -285,6 +301,9 @@ def validate_profile(): host_id=self.host, soc_model=self.model, error_only=self.error_only, + dump_intermediate_outputs=( + True if expected_intermediate_events != -1 else False + ), ) adb.push(inputs=[sample_inputs], input_list=input_list) adb.execute() @@ -294,12 +313,20 @@ def validate_profile(): if expected_profile_events != -1: adb.pull_etdump(etdump_path, callback=validate_profile) + if expected_intermediate_events != -1: + adb.pull_debug_output( + etdump_path, + debug_output_path, + callback=validate_intermediate_tensor, + ) + def lower_module_and_test_output( self, module: torch.nn.Module, sample_inputs: Tuple[torch.Tensor], expected_partitions: int = 1, expected_profile_events: int = -1, + expected_intermediate_events: int = -1, assert_output_equal: bool = True, skip_node_id_set: set = None, skip_node_op_set: set = None, @@ -323,7 +350,6 @@ def lower_module_and_test_output( # Therefore, won't want to pre-allocate # by memory manager in runtime. memory_planning_pass=MemoryPlanningPass( - memory_planning_algo="greedy", alloc_graph_input=not self.shared_buffer, alloc_graph_output=not self.shared_buffer, ), @@ -344,11 +370,19 @@ def lower_module_and_test_output( etrecord_path = "etrecord.bin" if self.enable_profile: generate_etrecord(etrecord_path, edge_copy, exec_prog) - # Check numerics - if assert_output_equal or expected_profile_events != -1: + if ( + assert_output_equal + or expected_profile_events != -1 + or expected_intermediate_events != -1 + ): self.verify_output( - module, sample_inputs, exec_prog, etrecord_path, expected_profile_events + module, + sample_inputs, + exec_prog, + etrecord_path, + expected_profile_events, + expected_intermediate_events, ) def get_qdq_module( diff --git a/backends/qualcomm/utils/utils.py b/backends/qualcomm/utils/utils.py index 6dc0c4c3c8d..a0c0abf7295 100644 --- a/backends/qualcomm/utils/utils.py +++ b/backends/qualcomm/utils/utils.py @@ -4,6 +4,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
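The verify_output changes above validate intermediate tensor dumps by pairing the ETDump pulled from the device with the debug output buffer and feeding both to the devtools Inspector. A minimal sketch of that inspection flow outside the test harness follows; the artifact paths and the printed summary are illustrative assumptions, not part of this patch.

from executorch.devtools import Inspector

# Artifacts produced by a runner built with event tracing and intermediate
# output dumping enabled; the test harness pulls these via SimpleADB.
etdump_path = "outputs/etdump.etdp"  # hypothetical path
debug_output_path = "outputs/debug_output.bin"  # hypothetical path

inspector = Inspector(
    etdump_path=etdump_path,
    debug_buffer_path=debug_output_path,
)

# Mirrors validate_intermediate_tensor(): count the events recorded while
# the lowered graph executed on device.
for event_block in inspector.event_blocks:
    if event_block.name == "Execute":
        print(f"captured {len(event_block.events)} intermediate events")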
+import operator +import warnings from collections import OrderedDict from typing import Callable, Dict, List, Tuple @@ -38,7 +40,11 @@ from executorch.backends.qualcomm.passes.recompose_pixel_unshuffle import ( RecomposePixelUnshuffle, ) +from executorch.backends.qualcomm.passes.recompose_rms_norm import RecomposeRmsNorm from executorch.backends.qualcomm.passes.remove_redundancy import RemoveRedundancy +from executorch.backends.qualcomm.passes.replace_index_put_input import ( + ReplaceIndexPutInput, +) from executorch.backends.qualcomm.serialization.qnn_compile_spec_schema import ( _soc_info_table, QcomChipset, ) @@ -56,6 +62,7 @@ convert_to_option, ) from executorch.backends.qualcomm.utils.constants import QCOM_QNN_COMPILE_SPEC + from executorch.exir import ExirExportedProgram from executorch.exir.backend.compile_spec_schema import CompileSpec from executorch.exir.lowered_backend_module import LoweredBackendModule @@ -63,9 +70,74 @@ from torch._decomp import core_aten_decompositions as torch_core_aten_decompositions from torch.export.exported_program import ExportedProgram from torch.fx import passes +from torch.fx.passes.operator_support import OperatorSupportBase from torch.library import Library + +class _AnnotationSkipper(OperatorSupportBase): + """ + Class used to partition out unwanted graph nodes, + e.g. - nodes to be excluded from quantization annotation + - nodes that have been grouped together as a submodule + + Attributes + ---------- + fp_node_id_set : set + a set of node names to be left in fp precision + fp_node_op_set : set + a set of node targets (aten dialect) to be left in fp precision + skip_annotated_submodule : bool + flag indicating whether annotated submodules should be skipped + + Methods + ------- + should_delegate(n: torch.fx.Node) + identify residual nodes that have not been lowered with fixed precision + should_skip(n: torch.fx.Node) + identify whether a node should be kept out of fixed precision + is_node_supported(_, node: torch.fx.Node) + overridden method for graph partitioning + """ + + def __init__( + self, + fp_node_id_set: set = None, + fp_node_op_set: set = None, + skip_annotated_submodule: bool = False, + ): + self.fp_node_id_set = fp_node_id_set + self.fp_node_op_set = fp_node_op_set + self.skip_annotated_submodule = skip_annotated_submodule + + def should_delegate(self, n: torch.fx.Node): + return n.op == "call_function" and n.target != operator.getitem + + def should_skip(self, n: torch.fx.Node): + return n.name in self.fp_node_id_set or n.target in self.fp_node_op_set + + def is_node_supported(self, _, node: torch.fx.Node) -> bool: + if self.skip_annotated_submodule: + if node.op == "get_attr": + return all(self.should_delegate(user) for user in node.users) + return self.should_delegate(node) + + if any( + [ + node.op in ("placeholder", "output"), + self.should_skip(node), + # check if parameters belong to a fallback operator + ( + node.op == "get_attr" + and all(self.should_skip(user) for user in node.users) + ), + ] + ): + print(f"[QNN Quantizer Annotation]: {node.name} | Skipped") + return False + + return True + + def qnn_capture_config(): return exir.CaptureConfig(enable_aot=True) @@ -184,8 +256,10 @@ def get_decomp_table() -> Dict[torch._ops.OperatorBase, Callable]: # The below super ops are supported by QNN remove_decompositions = [ torch.ops.aten.pixel_shuffle.default, + torch.ops.aten.pixel_unshuffle.default, torch.ops.aten.hardsigmoid.default, torch.ops.aten.hardswish.default, + torch.ops.aten._safe_softmax.default, ] for key in remove_decompositions: @@ 
-201,6 +275,7 @@ def _transform(edge_program: ExportedProgram) -> None: graph_module = edge_program.graph_module RemoveRedundancy()(graph_module) RecomposePixelUnshuffle()(graph_module) + RecomposeRmsNorm()(graph_module) ConvertToLinear()(graph_module) ConvertPReLU(edge_program)(graph_module) ConvertBmmToMatmul()(graph_module) @@ -211,6 +286,7 @@ def _transform(edge_program: ExportedProgram) -> None: AnnotateDecomposed(edge_program)(graph_module) FoldQDQ()(graph_module) LayoutTransform(edge_program)(graph_module) + ReplaceIndexPutInput(edge_program)(graph_module) # Since QDQ nodes are stripped, update graph signature again to validate program edge_program._graph_signature = _get_updated_graph_signature( @@ -238,6 +314,285 @@ def capture_program( return edge_ep +def _partition_graph_into_submodules(gm, subgm_tag, subgm_cb, ptn): + from torch.fx.passes.utils.fuser_utils import ( + erase_nodes, + fuse_as_graphmodule, + insert_subgm, + legalize_graph, + topo_sort, + ) + + partitions = ptn.propose_partitions() + # insert meta for each partition group + for i, partition in enumerate(partitions): + for node in partition.nodes: + node.meta[subgm_tag] = i + + for i in range(len(partitions)): + # find nodes with same group id in current graph + node_list = [ + node for node in gm.graph.nodes if node.meta.get(subgm_tag, "") == i + ] + # fuse group nodes into submodule + sorted_nodes = topo_sort(node_list) + submodule_name = f"{subgm_tag}_{i}" + subgm, orig_inputs, orig_outputs = fuse_as_graphmodule( + gm, sorted_nodes, submodule_name + ) + # insert submodule & trim group nodes + gm = insert_subgm( + gm, + subgm_cb(subgm, submodule_name), + orig_inputs, + orig_outputs, + ) + erase_nodes(gm, sorted_nodes) + legalize_graph(gm) + + gm.recompile() + return gm + + +def _canonicalize_graph_with_lowered_module(gm, subgm_tag, ptn): + from executorch.exir.backend.backend_api import to_backend + + # return lowered program for user to debug + exported_progs = [] + # partition each submodule which went through convert_pt2e + for node in gm.graph.nodes: + if node.op == "call_module" and subgm_tag in node.name: + # obtain sample inputs through meta + subgm_input = [ + torch.ones(arg.meta["val"].shape, dtype=arg.meta["val"].dtype) + for arg in node.args + ] + # program meets QNN backend requirement + sub_prog = capture_program(gm.get_submodule(node.name), tuple(subgm_input)) + # start lowering with given partitioner + exported_progs.append(to_backend(sub_prog.exported_program, ptn)) + # replace submodule with lowered module + gm.set_submodule( + node.name, + exported_progs[-1].graph_module, + ) + # if node has multiple outputs, getitem nodes will be generated by default + if all(n.target != operator.getitem for n in node.users): + with gm.graph.inserting_after(node): + getitem_node = gm.graph.call_function( + operator.getitem, + (node, 0), + ) + getitem_node.meta = node.meta + node.replace_all_uses_with( + replace_with=getitem_node, + delete_user_cb=lambda user: user.target != operator.getitem, + ) + + gm.recompile() + return gm, exported_progs + + +def skip_annotation( + nn_module: torch.nn.Module, + quantizer, + partitioner, + sample_input: Tuple[torch.Tensor, ...], + calibration_cb: Callable[[torch.fx.GraphModule], None], + fp_node_id_set: set = None, + fp_node_op_set: set = None, + fallback_to_cpu: bool = True, +): + r""" + Exclude specific operators from quantizer annotation. + Skipped operators stay on CPU by default; set 'fallback_to_cpu' + to False to try delegating them with FP16 precision. 
+ + e.g.: consider following graph: + bias_1 weight_1 input_1 bias_2 weight_2 input_2 + | (placeholder) | | (placeholder) | + \ | / \ | / + \ | / \ | / + \ | / \ | / + conv2d_1 conv2d_2 + (torch.ops.aten.conv2d.default) + \ / + \ / + \_______ _______/ + add_1 + (torch.ops.aten.add.default) + | + output + + If user wants to skip convolution op by names with + 'skip_node_id_set' = {"conv2d_1"} + "bias_1 / weight_1 / input_1 / input_2 / conv2d_1" + will be partitioned out and not annotated / lowered with QNN. + + [Generated graph] + bias_1 weight_1 input_1 input_2 + | (placeholder) | | + \ | / | + \ | / | + \ | / | + conv2d_1 | + \ / + \ / + \ / + lowered_module_1 + (QNN fixed precision) + | + output + + If user wants to skip convolution op by target with + 'skip_node_op_set' = {torch.ops.aten.conv2d.default} + "bias_1 / weight_1 / input_1 / conv2d_1, + bias_2 / weight_2 / input_2 / conv2d_2" + will be partitioned out and not annotated / lowered with QNN. + + [Generated graph] + bias_1 weight_1 input_1 bias_2 weight_2 input_2 + | (placeholder) | | (placeholder) | + \ | / \ | / + \ | / \ | / + \ | / \ | / + conv2d_1 conv2d_2 + (torch.ops.aten.conv2d.default) + \ / + \ / + \__ __/ + lowered_module_1 + (QNN fixed precision) + | + output + + If user wants to delegate the skipped conv2d from above graph + with 'fallback_to_cpu' = False: + + [Generated graph] + input_1 input_2 + (placeholder) (placeholder) + | | + \ / + lowered_module_2 + (QNN fp16 precision) + | + | + lowered_module_1 + (QNN fixed precision) + | + output + + Args: + nn_module (torch.nn.Module): The module to be lowered. + quantizer (QnnQuantizer): Instance of QnnQuantizer. + partitioner (QnnPartitioner): Instance of QnnPartitioner. + sample_input ((torch.Tensor, ...)): Sample input tensors for graph exporting. + calibration_cb (callable): Callback function for user-defined calibration. + fp_node_id_set ({str, ...}): Set of operator names to be left in fp precision. + fp_node_op_set ({torch.ops.aten.xxx, ...}): Set of operator targets to be left in fp precision. + fallback_to_cpu (bool): Whether to lower skipped nodes to fp16 or not. + + Returns: + exported_programs: List of programs lowered to QnnBackend (quantized graphs only). 
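Building on the signature and docstring above, a usage sketch of skip_annotation is given below. The toy module, calibration callback, and chipset are illustrative, and the import paths for QnnQuantizer, QnnPartitioner, and the HTP spec helpers are assumed from the surrounding Qualcomm utilities rather than shown in this patch.

import torch

from executorch.backends.qualcomm.partition.qnn_partitioner import QnnPartitioner
from executorch.backends.qualcomm.quantizer.quantizer import QnnQuantizer
from executorch.backends.qualcomm.serialization.qnn_compile_spec_schema import (
    QcomChipset,
)
from executorch.backends.qualcomm.utils.utils import (
    generate_htp_compiler_spec,
    generate_qnn_executorch_compiler_spec,
    skip_annotation,
)

# Toy module and sample input (illustrative only).
model = torch.nn.Sequential(torch.nn.Conv2d(3, 8, 3), torch.nn.ReLU()).eval()
sample_input = (torch.randn(1, 3, 32, 32),)

backend_options = generate_htp_compiler_spec(use_fp16=False)
compiler_specs = generate_qnn_executorch_compiler_spec(
    soc_model=QcomChipset.SM8650,
    backend_options=backend_options,
)


def calibrate(prepared_gm):
    # User-defined calibration: run representative inputs through the
    # prepared (fake-quant) graph module.
    prepared_gm(*sample_input)


graph_module, exported_progs = skip_annotation(
    nn_module=model,
    quantizer=QnnQuantizer(),
    partitioner=QnnPartitioner(compiler_specs),
    sample_input=sample_input,
    calibration_cb=calibrate,
    fp_node_op_set={torch.ops.aten.conv2d.default},
    fallback_to_cpu=False,  # try to delegate the skipped conv with FP16
)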
+ """ + from executorch.backends.qualcomm.serialization.qnn_compile_spec_schema import ( + QnnExecuTorchHtpPrecision, + ) + from executorch.backends.qualcomm.serialization.qnn_compile_spec_serialize import ( + convert_to_option, + ) + from torch.ao.quantization.quantize_pt2e import convert_pt2e, prepare_pt2e + from torch.fx.passes.infra.partitioner import CapabilityBasedPartitioner + + def prepare_subgm(subgm, subgm_name): + # prepare current submodule for quantization annotation + subgm_prepared = prepare_pt2e(subgm, quantizer) + # overwrite this attribute, otherwise the name would be set to "GraphModule" + # and we could not tell the prepared submodules apart + subgm_prepared.__class__.__name__ = subgm_name + return subgm_prepared + + fp_node_id_set = fp_node_id_set if fp_node_id_set is not None else set() + fp_node_op_set = fp_node_op_set if fp_node_op_set is not None else set() + graph_module = torch.export.export(nn_module, sample_input).module() + # define node support type + capability_partitioner = CapabilityBasedPartitioner( + graph_module, + _AnnotationSkipper(fp_node_id_set, fp_node_op_set), + allows_single_node_partition=True, + ) + subgm_tag = "annotated_group" + graph_module = _partition_graph_into_submodules( + gm=graph_module, + subgm_tag=subgm_tag, + subgm_cb=prepare_subgm, + ptn=capability_partitioner, + ) + # perform calibration + calibration_cb(graph_module) + # convert submodules which went through prepare_pt2e + for node in graph_module.graph.nodes: + if node.op == "call_module": + graph_module.set_submodule( + node.name, convert_pt2e(graph_module.get_submodule(node.name)) + ) + # canonicalize graph for lowering again + graph_module, exported_progs = _canonicalize_graph_with_lowered_module( + gm=graph_module, + subgm_tag=subgm_tag, + ptn=partitioner, + ) + + if not fallback_to_cpu: + try: + from executorch.exir.backend.partitioner import DelegationSpec + + # change HTP compiler spec for hardware to enable fp16 + qnn_option = generate_qnn_executorch_option( + partitioner.compiler_specs_snapshot + ) + compile_option = convert_to_option(qnn_option) + htp_options = compile_option.backend_options.htp_options + htp_options.precision = QnnExecuTorchHtpPrecision.kHtpFp16 + partitioner.delegation_spec = DelegationSpec( + "QnnBackend", + [ + CompileSpec( + QCOM_QNN_COMPILE_SPEC, convert_to_flatbuffer(compile_option) + ) + ], + ) + except: + print( + "Failed to change HTP compiler spec with 'use_fp16' as True;" + " skipped operators will fall back to CPU." + ) + return graph_module, exported_progs + + # try lowering skipped operators into fp16 + capability_partitioner = CapabilityBasedPartitioner( + graph_module, + _AnnotationSkipper(skip_annotated_submodule=True), + allows_single_node_partition=True, + ) + subgm_tag = "skipped_group" + graph_module = _partition_graph_into_submodules( + gm=graph_module, + subgm_tag=subgm_tag, + subgm_cb=lambda subgm, _: subgm, + ptn=capability_partitioner, + ) + graph_module, exported_progs_fp = _canonicalize_graph_with_lowered_module( + gm=graph_module, + subgm_tag=subgm_tag, + ptn=partitioner, + ) + exported_progs.extend(exported_progs_fp) + + return graph_module, exported_progs + + def from_context_binary( ctx_path: str, op_name: str, soc_model: QcomChipset = QcomChipset.SM8650 ): @@ -380,7 +735,7 @@ def generate_qnn_executorch_compiler_spec( debug: bool = False, saver: bool = False, online_prepare: bool = False, - tensor_dump_output_path: str = "", + dump_intermediate_outputs: bool = False, profile: bool = False, shared_buffer: bool = False, 
is_from_context_binary: bool = False, @@ -402,10 +757,8 @@ def generate_qnn_executorch_compiler_spec( saver: Instead of compiling the model, run QNN Saver. Please check documents of Qualcomm AI Engine Direct SDK. This feature is usually for debugging purpose. - tensor_dump_output_path: If a path is given, Delegate would write - outputs of each OP there in runtime. In ALL cases, - we don't recommend to set this option. This option exist just - for debugging some accuracy issues. + dump_intermediate_outputs: If enabled, all intermediate tensor outputs will be dumped. + This option exists for debugging accuracy issues. profile: Enable profile the performance of per operator. Note that for now only support kProfileDetailed to profile the performance of each operator with cycle unit. @@ -423,6 +776,13 @@ def generate_qnn_executorch_compiler_spec( if soc_model not in _supported_soc_models: raise ValueError(f"unknown SoC model for QNN: {soc_model}") + if profile and dump_intermediate_outputs: + warnings.warn( + "It is not recommended to turn on both profiling and dump_intermediate_outputs at the same time" + ", because dump_intermediate_outputs will cause a performance drop.", + stacklevel=1, + ) + qnn_executorch_options = QnnExecuTorchOptions( _soc_info_table[soc_model], backend_options ) @@ -433,12 +793,11 @@ def generate_qnn_executorch_compiler_spec( else QnnExecuTorchLogLevel.kLogLevelWarn ) + qnn_executorch_options.dump_intermediate_outputs = dump_intermediate_outputs + if saver: qnn_executorch_options.library_path = "libQnnSaver.so" - if len(tensor_dump_output_path.strip()) != 0: - qnn_executorch_options.tensor_dump_output_path = tensor_dump_output_path - if profile: qnn_executorch_options.profile_level = ( QnnExecuTorchProfileLevel.kProfileDetailed diff --git a/backends/transforms/TARGETS b/backends/transforms/TARGETS index d461eb49788..df50e45f099 100644 --- a/backends/transforms/TARGETS +++ b/backends/transforms/TARGETS @@ -88,6 +88,20 @@ runtime.python_library( ], ) +runtime.python_library( + name = "view_copy_to_squeeze_unsqueeze", + srcs = ["view_copy_to_squeeze_unsqueeze.py"], + visibility = [ + "//executorch/backends/...", + ], + deps = [ + ":utils", + "//caffe2:torch", + "//executorch/exir:pass_base", + "//executorch/exir/dialects:lib", + ], +) + runtime.python_library( name = "fuse_view_copy", srcs = ["fuse_view_copy.py"], diff --git a/backends/transforms/addmm_mm_to_linear.py b/backends/transforms/addmm_mm_to_linear.py index 7855de617b7..358cbb7ac14 100644 --- a/backends/transforms/addmm_mm_to_linear.py +++ b/backends/transforms/addmm_mm_to_linear.py @@ -130,7 +130,7 @@ def replace_addmm_mm_with_linear(graph: torch.fx.Graph) -> torch.fx.Graph: "call_function", ops.aten.linear.default, args ) node.replace_all_uses_with(linear_node) - output_val = linear_node.target( + output_val = linear_node.target( # pyre-fixme[29] args[0].meta["val"], args[1].meta["val"], args[2].meta["val"] ) else: @@ -147,7 +147,7 @@ def replace_addmm_mm_with_linear(graph: torch.fx.Graph) -> torch.fx.Graph: "call_function", ops.aten.linear.default, args ) node.replace_all_uses_with(linear_node) - output_val = linear_node.target( + output_val = linear_node.target( # pyre-fixme[29] args[0].meta["val"], args[1].meta["val"] ) linear_node.meta = node.meta diff --git a/backends/transforms/decompose_sdpa.py b/backends/transforms/decompose_sdpa.py index 6dbbf564f56..329dab96df2 100644 --- a/backends/transforms/decompose_sdpa.py +++ b/backends/transforms/decompose_sdpa.py @@ -34,7 +34,7 @@ def call( # refer 
to pytorch/test/test_decomp.py decomposed_module = make_fx( node.target, - decomposition_table=get_decompositions( + decomposition_table=get_decompositions( # pyre-fixme[6] [ torch.ops.aten._scaled_dot_product_flash_attention_for_cpu.default, ] diff --git a/backends/transforms/view_copy_to_squeeze_unsqueeze.py b/backends/transforms/view_copy_to_squeeze_unsqueeze.py new file mode 100644 index 00000000000..094ec6a3340 --- /dev/null +++ b/backends/transforms/view_copy_to_squeeze_unsqueeze.py @@ -0,0 +1,128 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# pyre-strict + +from typing import List, Optional, Union + +import torch + +from executorch.exir.dialects._ops import ops as exir_ops +from executorch.exir.pass_base import ExportPass, PassResult + + +class ViewCopyToSqueezeUnsqueezePass(ExportPass): + """ + Replaces view_copy nodes with squeeze_copy.dims nodes if the view node reduces dims of size 1. + Replaces view_copy nodes with unsqueeze_copy.default nodes if the view node adds a dim of size 1. + """ + + def __init__(self) -> None: + super().__init__() + self.view_copy_op: torch._ops.OpOverload = exir_ops.edge.aten.view_copy.default + self.squeeze_op: torch._ops.OpOverload = exir_ops.edge.aten.squeeze_copy.dims + self.unsqueeze_op: torch._ops.OpOverload = ( + exir_ops.edge.aten.unsqueeze_copy.default + ) + + def is_node_target( + self, node: torch.fx.Node, target: torch._ops.OperatorBase + ) -> bool: + return node.op == "call_function" and node.target == target + + def find_squeeze_dims( + self, + input_shape: List[int], + view_shape: List[int], + ) -> Optional[List[int]]: + # view_shape should be a subset of input_shape + if len(input_shape) <= len(view_shape): + return None + + # check that all dims are equal except the removed dims + i = 0 + j = 0 + idx = [] + while i < len(input_shape): + if input_shape[i] != view_shape[j]: + if input_shape[i] == 1: + idx.append(i) + j -= 1 + # continue to check remaining dims are equal + else: + return None + i += 1 + j += 1 + return idx + + def find_unsqueeze_dim( + self, + input_shape: List[int], + view_shape: List[int], + ) -> Optional[int]: + # unsqueeze should increase the length of input_shape by 1 + if len(view_shape) - len(input_shape) != 1: + return None + + # check that all dims are equal except the added dim + i = 0 + j = 0 + idx = -1 + while j < len(view_shape): + if input_shape[i] != view_shape[j]: + if view_shape[j] == 1: + idx = j + i -= 1 + # continue to check remaining dims are equal + else: + return None + i += 1 + j += 1 + return idx + + def replace_view_copy_node( + self, + graph_module: torch.fx.GraphModule, + view_node: torch.fx.Node, + op: torch._ops.OpOverload, + arg: Union[List[int], int], + ) -> None: + with graph_module.graph.inserting_before(view_node): + new_node = graph_module.graph.create_node( + "call_function", + op, + (view_node.args[0], arg), + ) + new_node.meta = view_node.meta + view_node.replace_all_uses_with(new_node) + graph_module.graph.erase_node(view_node) + + def call(self, graph_module: torch.fx.GraphModule) -> PassResult: + modified = False + for node in graph_module.graph.nodes: + if self.is_node_target(node, self.view_copy_op): + input_node = node.args[0] + input_shape = input_node.meta["val"].shape + view_shape = node.args[1] + squeeze_dims = self.find_squeeze_dims(input_shape, view_shape) + if squeeze_dims: + 
self.replace_view_copy_node( + graph_module, node, self.squeeze_op, squeeze_dims + ) + modified = True + continue + unsqueeze_dim = self.find_unsqueeze_dim(input_shape, view_shape) + if unsqueeze_dim: + self.replace_view_copy_node( + graph_module, node, self.unsqueeze_op, unsqueeze_dim + ) + modified = True + continue + + if modified: + graph_module.recompile() + graph_module = super().call(graph_module).graph_module + return PassResult(graph_module, modified) diff --git a/backends/vulkan/cmake/ShaderLibrary.cmake b/backends/vulkan/cmake/ShaderLibrary.cmake index 49dc27056a0..b44736d20dd 100644 --- a/backends/vulkan/cmake/ShaderLibrary.cmake +++ b/backends/vulkan/cmake/ShaderLibrary.cmake @@ -50,8 +50,8 @@ function(gen_vulkan_shader_lib_cpp shaders_path) execute_process( COMMAND "${PYTHON_EXECUTABLE}" - ${EXECUTORCH_ROOT}/backends/vulkan/runtime/gen_vulkan_spv.py - --glsl-path ${shaders_path} --output-path ${VULKAN_SHADERGEN_OUT_PATH} + ${EXECUTORCH_ROOT}/backends/vulkan/runtime/gen_vulkan_spv.py --glsl-path + ${shaders_path} --output-path ${VULKAN_SHADERGEN_OUT_PATH} --glslc-path=${GLSLC_PATH} --tmp-dir-path=${VULKAN_SHADERGEN_OUT_PATH} --env ${VULKAN_GEN_ARG_ENV} RESULT_VARIABLE error_code diff --git a/backends/vulkan/docs/android_demo.md b/backends/vulkan/docs/android_demo.md index aaff7a7a727..8570859ed34 100644 --- a/backends/vulkan/docs/android_demo.md +++ b/backends/vulkan/docs/android_demo.md @@ -94,8 +94,9 @@ binary using the Android NDK toolchain. cmake . -DCMAKE_INSTALL_PREFIX=cmake-android-out \ -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \ -DANDROID_ABI=$ANDROID_ABI \ - -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ + -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ -DEXECUTORCH_BUILD_VULKAN=ON \ -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ -DPYTHON_EXECUTABLE=python \ diff --git a/backends/vulkan/partitioner/supported_ops.py b/backends/vulkan/partitioner/supported_ops.py index 08d7f96a6b9..ca7ce72caed 100644 --- a/backends/vulkan/partitioner/supported_ops.py +++ b/backends/vulkan/partitioner/supported_ops.py @@ -8,7 +8,10 @@ import operator -from executorch.backends.vulkan.passes.custom_ops_defs import grid_priors_op # noqa +from executorch.backends.vulkan.passes.custom_ops_defs import ( # noqa + conv_with_clamp_op, + grid_priors_op, +) from executorch.exir.dialects._ops import ops as exir_ops @@ -84,6 +87,7 @@ def __contains__(self, op): CONVOLUTION_OPS = [ exir_ops.edge.aten.convolution.default, + exir_ops.edge.et_vk.conv_with_clamp.default, ] REDUCTION_OPS = [ diff --git a/backends/vulkan/partitioner/vulkan_partitioner.py b/backends/vulkan/partitioner/vulkan_partitioner.py index 4d24877b631..103297bc758 100644 --- a/backends/vulkan/partitioner/vulkan_partitioner.py +++ b/backends/vulkan/partitioner/vulkan_partitioner.py @@ -38,6 +38,9 @@ torch.ops.aten.upsample_nearest2d.vec, ] +logger: logging.Logger = logging.getLogger("") +logger.setLevel(logging.INFO) + class VulkanSupportedOperators(OperatorSupportBase): _ops: OpList = enumerate_supported_ops() @@ -110,7 +113,7 @@ def is_node_supported( ) -> bool: r = self._is_node_supported(submodules, node) if not r and node.op == "call_function": - logging.info(f"Skipping node in Vulkan partitioning: {node.format_node()}") + logger.info(f"Skipping node in Vulkan partitioning: {node.format_node()}") return r def _is_node_supported( @@ -179,9 +182,9 @@ def partition(self, exported_program: ExportedProgram) -> PartitionResult: pl = 
len(partition_list) if pl == 0: - logging.warning("No Vulkan subgraphs can be partitioned!") + logger.warning("No Vulkan subgraphs can be partitioned!") else: - logging.info(f"Found {pl} Vulkan subgraphs to be partitioned.") + logger.info(f"Found {pl} Vulkan subgraphs to be partitioned.") tag_constant_data(exported_program) diff --git a/backends/vulkan/passes/custom_ops_defs.py b/backends/vulkan/passes/custom_ops_defs.py index 62f21bfee63..fd586b665a0 100644 --- a/backends/vulkan/passes/custom_ops_defs.py +++ b/backends/vulkan/passes/custom_ops_defs.py @@ -48,6 +48,43 @@ def conv_with_clamp_impl( conv_with_clamp_op = getattr(getattr(torch.ops, namespace), name) +def conv_with_clamp_out_impl( + input, + weight, + bias=None, + stride=1, + padding=0, + dilation=1, + transposed=False, + output_padding=0, + groups=1, + output_min=-float("inf"), + output_max=float("inf"), + out=None, +): + out = conv_with_clamp_impl( + input, + weight, + bias, + stride, + padding, + dilation, + transposed, + output_padding, + groups, + output_min, + output_max, + ) + return out + + +name = "conv_with_clamp.out" +lib.define( + f"{name}(Tensor input, Tensor weight, Tensor? bias, SymInt[] stride, SymInt[] padding, SymInt[] dilation, bool transposed, SymInt[] output_padding, SymInt groups, Scalar? output_min, Scalar? output_max, *, Tensor(a!) out) -> Tensor(a!)" +) +lib.impl(name, conv_with_clamp_out_impl, "CompositeExplicitAutograd") + + # The dimension of x should be larger than 1 def grid_priors_impl( x, diff --git a/backends/vulkan/runtime/VulkanBackend.cpp b/backends/vulkan/runtime/VulkanBackend.cpp index fd06841beca..7ed9469f77f 100644 --- a/backends/vulkan/runtime/VulkanBackend.cpp +++ b/backends/vulkan/runtime/VulkanBackend.cpp @@ -412,7 +412,7 @@ void maybe_resize_output( // VulkanBackend class // -class VulkanBackend final : public PyTorchBackendInterface { +class VulkanBackend final : public ::executorch::runtime::BackendInterface { public: ~VulkanBackend() override = default; diff --git a/backends/vulkan/runtime/api/api.h b/backends/vulkan/runtime/api/api.h index de77c57fb0e..0f496a4af8a 100644 --- a/backends/vulkan/runtime/api/api.h +++ b/backends/vulkan/runtime/api/api.h @@ -12,7 +12,7 @@ #include #include -#include +#include #include #include diff --git a/backends/vulkan/runtime/api/containers/StagingBuffer.h b/backends/vulkan/runtime/api/containers/StagingBuffer.h new file mode 100644 index 00000000000..6f67ae8a64a --- /dev/null +++ b/backends/vulkan/runtime/api/containers/StagingBuffer.h @@ -0,0 +1,104 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#pragma once + +// @lint-ignore-every CLANGTIDY facebook-hte-BadMemberName + +#include + +#include + +#include + +namespace vkcompute { +namespace api { + +class StagingBuffer final { + private: + Context* context_p_; + vkapi::ScalarType dtype_; + size_t numel_; + size_t nbytes_; + vkapi::VulkanBuffer vulkan_buffer_; + + void* mapped_data_; + + public: + StagingBuffer( + Context* context_p, + const vkapi::ScalarType dtype, + const size_t numel) + : context_p_(context_p), + dtype_(dtype), + numel_(numel), + nbytes_(element_size(dtype_) * numel_), + vulkan_buffer_( + context_p_->adapter_ptr()->vma().create_staging_buffer(nbytes_)), + mapped_data_(nullptr) {} + + StagingBuffer(const StagingBuffer&) = delete; + StagingBuffer& operator=(const StagingBuffer&) = delete; + + StagingBuffer(StagingBuffer&&) = default; + StagingBuffer& operator=(StagingBuffer&&) = default; + + ~StagingBuffer() { + context_p_->register_buffer_cleanup(vulkan_buffer_); + } + + inline vkapi::ScalarType dtype() { + return dtype_; + } + + inline vkapi::VulkanBuffer& buffer() { + return vulkan_buffer_; + } + + inline void* data() { + if (!mapped_data_) { + mapped_data_ = vulkan_buffer_.allocation_info().pMappedData; + } + return mapped_data_; + } + + inline size_t numel() { + return numel_; + } + + inline size_t nbytes() { + return nbytes_; + } + + inline void copy_from(const void* src, const size_t nbytes) { + VK_CHECK_COND(nbytes <= nbytes_); + memcpy(data(), src, nbytes); + vmaFlushAllocation( + vulkan_buffer_.vma_allocator(), + vulkan_buffer_.allocation(), + 0u, + VK_WHOLE_SIZE); + } + + inline void copy_to(void* dst, const size_t nbytes) { + VK_CHECK_COND(nbytes <= nbytes_); + vmaInvalidateAllocation( + vulkan_buffer_.vma_allocator(), + vulkan_buffer_.allocation(), + 0u, + VK_WHOLE_SIZE); + memcpy(dst, data(), nbytes); + } + + inline void set_staging_zeros() { + memset(data(), 0, nbytes_); + } +}; + +} // namespace api +} // namespace vkcompute diff --git a/backends/vulkan/runtime/api/containers/StorageBuffer.h b/backends/vulkan/runtime/api/containers/StorageBuffer.h deleted file mode 100644 index 17c34706057..00000000000 --- a/backends/vulkan/runtime/api/containers/StorageBuffer.h +++ /dev/null @@ -1,70 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#pragma once - -// @lint-ignore-every CLANGTIDY facebook-hte-BadMemberName - -#include - -#include - -namespace vkcompute { -namespace api { - -class StorageBuffer final { - private: - Context* context_p_; - vkapi::ScalarType dtype_; - size_t numel_; - size_t nbytes_; - vkapi::VulkanBuffer vulkan_buffer_; - - public: - StorageBuffer( - Context* context_p, - const vkapi::ScalarType dtype, - const size_t numel, - const bool gpuonly = false) - : context_p_(context_p), - dtype_(dtype), - numel_(numel), - nbytes_(element_size(dtype_) * numel_), - vulkan_buffer_(context_p_->adapter_ptr()->vma().create_storage_buffer( - nbytes_, - gpuonly)) {} - - StorageBuffer(const StorageBuffer&) = delete; - StorageBuffer& operator=(const StorageBuffer&) = delete; - - StorageBuffer(StorageBuffer&&) = default; - StorageBuffer& operator=(StorageBuffer&&) = default; - - ~StorageBuffer() { - context_p_->register_buffer_cleanup(vulkan_buffer_); - } - - inline vkapi::ScalarType dtype() { - return dtype_; - } - - inline vkapi::VulkanBuffer& buffer() { - return vulkan_buffer_; - } - - inline size_t numel() { - return numel_; - } - - inline size_t nbytes() { - return nbytes_; - } -}; - -} // namespace api -} // namespace vkcompute diff --git a/backends/vulkan/runtime/api/containers/Tensor.cpp b/backends/vulkan/runtime/api/containers/Tensor.cpp index 5e67b689735..498ea37f3be 100644 --- a/backends/vulkan/runtime/api/containers/Tensor.cpp +++ b/backends/vulkan/runtime/api/containers/Tensor.cpp @@ -13,41 +13,24 @@ namespace vkcompute { namespace api { -/* - * Given the strides of a buffer-backed tensor, estimate the equivalent memory - * layout enum value by identifying the fastest moving dimension. - */ -utils::GPUMemoryLayout estimate_memory_layout( - const std::vector& dim_order) { - int64_t fastest_dim_whcn = dim_order.size() - 1 - dim_order.back(); - if (fastest_dim_whcn >= 0 && fastest_dim_whcn <= 3) { - return utils::GPUMemoryLayout(fastest_dim_whcn); - } - - // TODO(ssjia) find a way to gracefully recover from this case by i.e. adding - // a UNKOWN GPUMemoryLayout. This is not high priority though because we don't - // expect this to ever come up in practice. 
- VK_THROW("No compatible GPUMemoryLayout value"); -} - std::vector calculate_dim_order( const size_t ndim, - const utils::GPUMemoryLayout memory_layout) { + const int32_t packed_dim) { // Special case for zero dim tensors if (ndim == 0) { return {0}; } std::vector dim_order(ndim); - int64_t last_dim = - ndim - utils::to_packed_dim_nchw_offset(memory_layout); + // Explicitly convert ndim to signed to prevent underflow + int64_t last_dim = int64_t(ndim) - 1 - packed_dim; int64_t cur_dim = 0; for (int d = 0; d < ndim; ++d) { if (d == last_dim) { - cur_dim += 1; + cur_dim++; } dim_order[d] = cur_dim; - cur_dim += 1; + cur_dim++; } if (last_dim >= 0) { dim_order[ndim - 1] = last_dim; @@ -56,44 +39,6 @@ std::vector calculate_dim_order( return dim_order; } -namespace { - -struct StrideDimIndexPair { - int64_t stride; - int64_t dim_i; - - StrideDimIndexPair() : stride(0), dim_i(0) {} - - explicit StrideDimIndexPair(int64_t stride, int64_t dim_i) - : stride(stride), dim_i(dim_i) {} - - bool operator>(const StrideDimIndexPair& other) const { - // Descending order - return stride < other.stride; - } - - bool operator<(const StrideDimIndexPair& other) const { - // Descending order - return stride > other.stride; - } -}; - -} // namespace - -std::vector strides_to_dim_order(const std::vector& strides) { - std::vector stride_dim_pairs(strides.size()); - for (size_t i = 0; i < strides.size(); ++i) { - stride_dim_pairs[i] = StrideDimIndexPair(strides[i], i); - } - std::stable_sort(stride_dim_pairs.begin(), stride_dim_pairs.end()); - - std::vector dim_order(strides.size()); - for (int i = 0; i < strides.size(); ++i) { - dim_order.at(i) = stride_dim_pairs.at(i).dim_i; - } - return dim_order; -} - std::vector calculate_strides( const std::vector& sizes, const std::vector& dim_order) { @@ -118,6 +63,42 @@ std::vector calculate_strides( return strides; } +/* + * Axis mapping is somewhat analogous to strides for texture backed tensors. + * + * The axis mapping is normalized to 4 dimensions, similar to the padded sizes. + * The first 3 values of the axis mapping indicate the (X,Y,Z) image texture + * axis that corresponds to the width, height, and channels dimension of the + * tensor. Thus the axis mapping can be considered to be in WHCN dimension + * order. + * + * The last value `axis_map.at(3)` indicates the WHCN index of the tensor + * dimension along which batches will be concatenated. This dimension can be + * referred to as the "inner dimension" To determine which image texture axis is + * used for the concatenation, a double lookup will need to be performed + * (axis_map.at(axis_map.at(3))). + * + * The reason for strucuring axis mapping this way is because for the batch dim, + * two things need to be easily derived: + * + * 1. The dim idx of the inner dimension, so that the size of the inner + * dimension can be easily determined. + * 2. The texture axis used to concatenate batches + * + * By storing the dim index of the inner dimension instead of the texture axis + * it maps to, both pieces of information are readily available. + * + * The axis mapping allows for permuted views of texture-backed tensors. + */ +std::vector default_axis_map() { + // Currently, all compute shaders have an assumption that the channels dim is + // used to combine with the batch dim of a tensor. 
However, once dim mapping + // is integrated into the tensor indexing logic for each compute shader, we + // can be more flexible with mapping the batch dim to different texture axes + // in order to improve performance or memory footprint. + return {0, 1, 2, 2}; +} + bool dim_order_is_valid(const std::vector& dim_order) { int64_t sum = 0; for (size_t i = 0; i < dim_order.size(); ++i) { @@ -151,7 +132,7 @@ std::vector unsqueeze_strides( std::vector calculate_padded_sizes( const std::vector& sizes, - const utils::GPUMemoryLayout memory_layout) { + const int32_t packed_dim) { int64_t ndim = sizes.size(); if (ndim == 0) { ndim = 1; @@ -165,8 +146,7 @@ std::vector calculate_padded_sizes( } // Pad the packed dim to the next multiple of 4. - const int64_t dim_offset = - utils::to_packed_dim_nchw_offset(memory_layout); + const int64_t dim_offset = packed_dim + 1; const int64_t padded_dim_size = utils::val_at(-dim_offset, sizes); padded_sizes.at(ndim_up4 - dim_offset) = utils::align_up_4(padded_dim_size); @@ -175,30 +155,214 @@ std::vector calculate_padded_sizes( utils::uvec3 calculate_image_extents( const std::vector& padded_sizes, - const utils::GPUMemoryLayout memory_layout) { + const std::vector& axis_map, + const int32_t packed_dim) { VK_CHECK_COND(padded_sizes.size() == 4); + VK_CHECK_COND(axis_map.size() == 4); + + utils::uvec3 extents({1, 1, 1}); + // First three elements of axis_map indicate which (X,Y,Z) image axis the + // width, height, and channels dim of the tensor maps to. + for (int whcn_dim = 0; whcn_dim < 3; ++whcn_dim) { + const int64_t axis = axis_map.at(whcn_dim); + const int64_t dim = padded_sizes.size() - 1 - whcn_dim; + extents[axis] = utils::safe_downcast(padded_sizes.at(dim)); + } + + // axis_map[3] indicates the WHCN index of the dimension used for batch + // concatenation. Thus a double lookup is required to determine the image axis + // used for batch concatenation. + const int64_t concatted_whcn_dim = axis_map.at(3); + const int64_t batch_axis = axis_map.at(concatted_whcn_dim); + // Multiply the extents of the batch axis by the batch size. 
+ extents[batch_axis] *= padded_sizes.at(0); + + VK_CHECK_COND(extents[axis_map.at(packed_dim)] % 4 == 0); + extents[axis_map.at(packed_dim)] /= 4; + return extents; +} + +// +// vTensorStorage +// - uint32_t N = utils::safe_downcast(padded_sizes.at(0)); - uint32_t C = utils::safe_downcast(padded_sizes.at(1)); - uint32_t H = utils::safe_downcast(padded_sizes.at(2)); - uint32_t W = utils::safe_downcast(padded_sizes.at(3)); +vkapi::VulkanImage allocate_image( + Context* const context_ptr, + utils::uvec3& image_extents, + const utils::StorageType storage_type, + const VkFormat image_format, + const bool allocate_memory) { + vkapi::Adapter* adapter_ptr = context_ptr->adapter_ptr(); - switch (memory_layout) { - case utils::kWidthPacked: - VK_CHECK_COND(W % 4 == 0); - W /= 4; + vkapi::ImageSampler::Properties sampler_props{ + VK_FILTER_NEAREST, + VK_SAMPLER_MIPMAP_MODE_NEAREST, + VK_SAMPLER_ADDRESS_MODE_REPEAT, + VK_BORDER_COLOR_FLOAT_TRANSPARENT_BLACK, + }; + + VkImageType image_type = VK_IMAGE_TYPE_3D; + VkImageViewType image_view_type; + + switch (storage_type) { + case utils::kTexture3D: + image_type = VK_IMAGE_TYPE_3D; + image_view_type = VK_IMAGE_VIEW_TYPE_3D; break; - case utils::kHeightPacked: - VK_CHECK_COND(H % 4 == 0); - H /= 4; + case utils::kTexture2D: + image_type = VK_IMAGE_TYPE_2D; + image_view_type = VK_IMAGE_VIEW_TYPE_2D; break; - case utils::kChannelsPacked: - VK_CHECK_COND(C % 4 == 0); - C /= 4; + default: + // Return an empty VulkanImage by default + return vkapi::VulkanImage(); + } + + VkSampler sampler = adapter_ptr->sampler_cache().retrieve(sampler_props); + + return adapter_ptr->vma().create_image( + vkapi::create_extent3d(image_extents), + image_format, + image_type, + image_view_type, + sampler_props, + sampler, + /*allow_transfer = */ true, + /*allocate_memory = */ allocate_memory); +} + +vkapi::VulkanBuffer allocate_buffer( + Context* const context_ptr, + const int64_t numel, + const utils::StorageType storage_type, + const vkapi::ScalarType dtype, + const bool allocate_memory) { + vkapi::Adapter* adapter_ptr = context_ptr->adapter_ptr(); + + switch (storage_type) { + case utils::kBuffer: break; + default: + // Return an empty VulkanBuffer if Buffer storage is not used + return vkapi::VulkanBuffer(); + } + + return adapter_ptr->vma().create_storage_buffer( + element_size(dtype) * numel, allocate_memory); +} + +vTensorStorage::vTensorStorage( + Context* const context, + const utils::StorageType storage_type, + const std::vector& axis_map, + const int32_t packed_dim, + const std::vector& padded_sizes, + const vkapi::ScalarType dtype, + const bool allocate_memory) + : context_(context), + storage_type_{storage_type}, + image_extents_( + calculate_image_extents(padded_sizes, axis_map, packed_dim)), + buffer_length_{utils::multiply_integers(padded_sizes)}, + buffer_offset_{0}, + image_(allocate_image( + context_, + image_extents_, + storage_type_, + to_vkformat(dtype), + allocate_memory)), + buffer_(allocate_buffer( + context_, + buffer_length_, + storage_type_, + dtype, + allocate_memory)), + last_access_{} {} + +vTensorStorage::vTensorStorage( + const vTensorStorage& other, + const int64_t buffer_offset) + : context_(other.context_), + storage_type_{other.storage_type_}, + image_extents_(other.image_extents_), + buffer_length_{other.buffer_length_}, + buffer_offset_{buffer_offset}, + image_(other.image_), + buffer_(other.buffer_, buffer_offset), + last_access_{other.last_access_} {} + +vTensorStorage::~vTensorStorage() { + flush(); +} + +void vTensorStorage::flush() { + 
if (image_) { + context_->register_image_cleanup(image_); + } else if (buffer_) { + context_->register_buffer_cleanup(buffer_); } + last_access_ = {}; +} - return {W, H, C * N}; +void vTensorStorage::transition( + vkapi::PipelineBarrier& pipeline_barrier, + const vkapi::PipelineStageFlags cur_stage, + const vkapi::MemoryAccessFlags cur_access) { + // Get last stage access + vkapi::PipelineStageFlags prev_stage = last_access_.stage; + vkapi::MemoryAccessFlags prev_access = last_access_.access; + + const bool prev_written = (prev_access & vkapi::MemoryAccessType::WRITE) != 0; + + VkImageLayout cur_layout = VK_IMAGE_LAYOUT_UNDEFINED; + VkImageLayout new_layout = VK_IMAGE_LAYOUT_UNDEFINED; + bool layout_changed = false; + if (image_) { + cur_layout = image_.layout(); + new_layout = vkapi::vk_layout(cur_stage, cur_access); + + layout_changed = cur_layout != new_layout; + } + + if (prev_written || layout_changed) { + VkPipelineStageFlags src_stage = vkapi::vk_stage(prev_stage); + if (0u == src_stage) { + src_stage = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT; + } + VkPipelineStageFlags dst_stage = vkapi::vk_stage(cur_stage); + if (0u == dst_stage) { + dst_stage = VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT; + } + + pipeline_barrier.stage.src |= src_stage; + pipeline_barrier.stage.dst |= dst_stage; + + if (image_) { + pipeline_barrier.images.emplace_back( + vkapi::vk_access(prev_stage, prev_access), + vkapi::vk_access(cur_stage, cur_access), + cur_layout, + new_layout, + image_); + + image_.set_layout(new_layout); + } else if (buffer_) { + pipeline_barrier.buffers.emplace_back( + vkapi::vk_access(prev_stage, prev_access), + vkapi::vk_access(cur_stage, cur_access), + buffer_); + } + } + + last_access_.stage = cur_stage; + last_access_.access = cur_access; +} + +bool vTensorStorage::is_copy_of(const vTensorStorage& other) const { + if (storage_type_ == utils::kBuffer) { + return buffer_.is_copy_of(other.buffer_); + } + return image_.is_copy_of(other.image_); } // @@ -213,26 +377,29 @@ vTensor::vTensor( const utils::GPUMemoryLayout memory_layout, const bool allocate_memory) : dtype_(dtype), - memory_layout_(memory_layout), - // Calculate tensor size metadata + // Calculate tensor metadata sizes_(sizes.begin(), sizes.end()), - dim_order_(calculate_dim_order(sizes_.size(), memory_layout_)), + packed_dim_(utils::to_packed_dim(memory_layout)), + dim_order_(calculate_dim_order(sizes_.size(), packed_dim_)), + axis_map_(default_axis_map()), strides_(calculate_strides(sizes, dim_order_)), numel_(utils::multiply_integers(sizes_)), - padded_sizes_{calculate_padded_sizes(sizes, memory_layout_)}, + padded_sizes_{calculate_padded_sizes(sizes, packed_dim_)}, unsqueezed_strides_{unsqueeze_strides(strides_, numel_)}, padded_numel_(utils::multiply_integers(padded_sizes_)), - texture_limits_{{0, 0, 0}}, + logical_limits_{{0, 0, 0}}, // Utility Uniform Buffers that can be passed to shaders as arguments sizes_uniform_(), strides_uniform_(), numel_uniform_(), - texture_limits_uniform_(), + axis_map_uniform_(), + logical_limits_uniform_(), // Construct Tensor storage storage_( context, storage_type, - memory_layout_, + axis_map_, + packed_dim_, padded_sizes_, dtype_, allocate_memory) { @@ -240,10 +407,7 @@ vTensor::vTensor( dim_order_is_valid(dim_order_), "computed dim order is invalid"); if (storage_type != utils::kBuffer) { - texture_limits_.limits = utils::ivec3{ - utils::safe_downcast(storage_.image_extents_[0]), - utils::safe_downcast(storage_.image_extents_[1]), - utils::safe_downcast(storage_.image_extents_[2])}; + 
set_logical_limits(storage_.image_extents_); } if (dtype == vkapi::kHalf) { @@ -256,10 +420,11 @@ vTensor::vTensor( vTensor::vTensor(const vTensor& other) : dtype_(other.dtype_), - memory_layout_(other.memory_layout_), // Copy tensor size metadata sizes_(other.sizes_.begin(), other.sizes_.end()), + packed_dim_{other.packed_dim_}, dim_order_(other.dim_order_.begin(), other.dim_order_.end()), + axis_map_(other.axis_map_.begin(), other.axis_map_.end()), strides_(other.strides_.begin(), other.strides_.end()), numel_(other.numel_), padded_sizes_{other.padded_sizes_.begin(), other.padded_sizes_.end()}, @@ -267,12 +432,13 @@ vTensor::vTensor(const vTensor& other) other.unsqueezed_strides_.begin(), other.unsqueezed_strides_.end()}, padded_numel_(other.padded_numel_), - texture_limits_{other.texture_limits_}, + logical_limits_{other.logical_limits_}, // Empty initialize Utility Uniform Buffers sizes_uniform_(), strides_uniform_(), numel_uniform_(), - texture_limits_uniform_(), + axis_map_uniform_(), + logical_limits_uniform_(), // Copy Tensor storage storage_(other.storage_) {} @@ -282,21 +448,23 @@ vTensor::vTensor( const std::vector& dim_order, const int64_t offset_numel) : dtype_(other.dtype_), - memory_layout_(estimate_memory_layout(dim_order)), // Copy tensor size metadata sizes_(sizes.begin(), sizes.end()), + packed_dim_(other.packed_dim_), dim_order_(dim_order.begin(), dim_order.end()), + axis_map_(default_axis_map()), strides_(calculate_strides(sizes_, dim_order_)), numel_(utils::multiply_integers(sizes_)), - padded_sizes_{calculate_padded_sizes(sizes, memory_layout_)}, + padded_sizes_{calculate_padded_sizes(sizes, packed_dim_)}, unsqueezed_strides_{unsqueeze_strides(strides_, numel_)}, padded_numel_(utils::multiply_integers(padded_sizes_)), - texture_limits_{{0, 0, 0}}, + logical_limits_(other.logical_limits_), // Empty initialize Utility Uniform Buffers sizes_uniform_(), strides_uniform_(), numel_uniform_(), - texture_limits_uniform_(), + axis_map_uniform_(), + logical_limits_uniform_(), // Copy Tensor storage storage_(other.storage_, vkapi::element_size(dtype_) * offset_numel) { VK_CHECK_COND( @@ -337,6 +505,25 @@ vkapi::VulkanBuffer& vTensor::buffer( return storage_.buffer_; } +void vTensor::set_logical_limits(const utils::uvec3& image_extents) { + logical_limits_.limits[0] = image_extents[axis_map_.at(0)]; + logical_limits_.limits[1] = image_extents[axis_map_.at(1)]; + logical_limits_.limits[2] = image_extents[axis_map_.at(2)]; +} + +utils::GPUMemoryLayout vTensor::estimate_memory_layout() const { + switch (packed_dim_) { + case WHCN::kWidthDim: + return utils::kWidthPacked; + case WHCN::kHeightDim: + return utils::kHeightPacked; + case WHCN::kChannelsDim: + return utils::kChannelsPacked; + default: + VK_THROW("Invalid packed dim"); + } +} + const vkapi::BufferBindInfo vTensor::sizes_ubo() { if (!sizes_uniform_.buffer()) { sizes_uniform_ = @@ -353,11 +540,19 @@ const vkapi::BufferBindInfo vTensor::strides_ubo() { return vkapi::BufferBindInfo(strides_uniform_.buffer()); } -const vkapi::BufferBindInfo vTensor::texture_limits_ubo() { - if (!texture_limits_uniform_.buffer()) { - texture_limits_uniform_ = ParamsBuffer(storage_.context_, texture_limits_); +const vkapi::BufferBindInfo vTensor::axis_map_ubo() { + if (!axis_map_uniform_.buffer()) { + axis_map_uniform_ = + ParamsBuffer(storage_.context_, utils::make_ivec4(axis_map_)); } - return vkapi::BufferBindInfo(texture_limits_uniform_.buffer()); + return vkapi::BufferBindInfo(axis_map_uniform_.buffer()); +} + +const vkapi::BufferBindInfo 
vTensor::logical_limits_ubo() { + if (!logical_limits_uniform_.buffer()) { + logical_limits_uniform_ = ParamsBuffer(storage_.context_, logical_limits_); + } + return vkapi::BufferBindInfo(logical_limits_uniform_.buffer()); } const vkapi::BufferBindInfo vTensor::numel_ubo() { @@ -380,17 +575,6 @@ size_t vTensor::staging_buffer_numel() const { return padded_numel_; } -VmaAllocationCreateInfo vTensor::get_allocation_create_info() const { - switch (storage_type()) { - case utils::kBuffer: - return storage_.buffer_.allocation_create_info(); - case utils::kTexture2D: - case utils::kTexture3D: - return storage_.image_.allocation_create_info(); - } - return {}; -} - VkMemoryRequirements vTensor::get_memory_requirements() const { switch (storage_type()) { case utils::kBuffer: @@ -414,51 +598,34 @@ void vTensor::bind_allocation(const vkapi::Allocation& allocation) { } } -void vTensor::update_metadata( - const std::vector& new_sizes, - const std::vector& new_dim_order) { - sizes_ = new_sizes; - dim_order_ = new_dim_order; +void vTensor::update_metadata() { strides_ = calculate_strides(sizes_, dim_order_); - // Only update the memory layout for buffer-backed tensors. Strides are - // meaningless for texture-backed tensors and do not impact the memory layout. - if (storage_type() == utils::kBuffer) { - memory_layout_ = estimate_memory_layout(dim_order_); - } numel_ = utils::multiply_integers(sizes_); - padded_sizes_ = calculate_padded_sizes(sizes_, memory_layout_); + padded_sizes_ = calculate_padded_sizes(sizes_, packed_dim_); unsqueezed_strides_ = unsqueeze_strides(strides_, numel_); padded_numel_ = utils::multiply_integers(padded_sizes_); - // Calculate the extents of the image texture that would have been required - // for a tensor of the new sizes. - utils::uvec3 virtual_extents = - calculate_image_extents(padded_sizes_, memory_layout_); - - // Update the texture limits to reflect the new virtual extents. - texture_limits_.limits = utils::ivec3{ - utils::safe_downcast(virtual_extents[0]), - utils::safe_downcast(virtual_extents[1]), - utils::safe_downcast(virtual_extents[2])}; + // Calculate the image extents that would have been used to allocate a texture + // withthe current sizes, and use that to set the logical limits. + set_logical_limits( + calculate_image_extents(padded_sizes_, axis_map_, packed_dim_)); if (sizes_uniform_.buffer()) { sizes_uniform_.update(utils::make_whcn_ivec4(sizes_)); } - if (texture_limits_uniform_.buffer()) { - texture_limits_uniform_.update(texture_limits_); - } if (strides_uniform_.buffer()) { strides_uniform_.update(utils::make_whcn_ivec4(unsqueezed_strides_)); } if (numel_uniform_.buffer()) { numel_uniform_.update(numel_); } -} - -void vTensor::update_size_metadata(const std::vector& new_sizes) { - // Dim order does not change on resize - update_metadata(new_sizes, dim_order_); + if (axis_map_uniform_.buffer()) { + axis_map_uniform_.update(utils::make_ivec4(axis_map_)); + } + if (logical_limits_uniform_.buffer()) { + logical_limits_uniform_.update(logical_limits_); + } } void vTensor::check_sizes(const std::vector& sizes) const { @@ -466,16 +633,20 @@ void vTensor::check_sizes(const std::vector& sizes) const { // For texture storage check that the current texture is large enough for // the new sizes of the tensor. 
utils::uvec3 virtual_extents = - calculate_image_extents(padded_sizes_, memory_layout_); + calculate_image_extents(padded_sizes_, axis_map_, packed_dim_); - bool valid_resize = virtual_extents[0] <= image_extents()[0]; - valid_resize = valid_resize && virtual_extents[1] <= image_extents()[1]; - valid_resize = valid_resize && virtual_extents[2] <= image_extents()[2]; + bool valid_resize = virtual_extents[0] <= storage_.image_extents_[0]; + valid_resize = + valid_resize && virtual_extents[1] <= storage_.image_extents_[1]; + valid_resize = + valid_resize && virtual_extents[2] <= storage_.image_extents_[2]; VK_CHECK_COND( valid_resize, "tensor sizes requires a larger texture than the current one."); } else { + // For buffer storage check that the current buffer is large enough for the + // new sizes of the tensor. int64_t numel = utils::multiply_integers(sizes); bool valid_resize = numel + storage_.buffer_offset_ <= storage_.buffer_length_; @@ -489,232 +660,72 @@ void vTensor::virtual_reconfigure( const std::vector& new_sizes, const std::vector& new_dim_order) { VK_CHECK_COND( - dim_order_is_valid(new_dim_order), "new dim order provided is invalid"); - check_sizes(new_sizes); - update_metadata(new_sizes, new_dim_order); -} + storage_type() == utils::kBuffer, + "virtual_reconfigure is only applicable for buffer backed tensors"); + VK_CHECK_COND(new_sizes.size() == new_dim_order.size()); + VK_CHECK_COND(dim_order_is_valid(new_dim_order)); -void vTensor::virtual_resize(const std::vector& new_sizes) { check_sizes(new_sizes); - update_size_metadata(new_sizes); -} - -void vTensor::reallocate(const std::vector& new_sizes) { - update_size_metadata(new_sizes); - storage_.discard_and_reallocate( - calculate_padded_sizes(new_sizes, memory_layout_), - memory_layout_, - dtype_); -} - -// -// vTensorStorage -// - -vkapi::VulkanImage allocate_image( - Context* const context_ptr, - utils::uvec3& image_extents, - const utils::StorageType storage_type, - const VkFormat image_format, - const bool allocate_memory) { - vkapi::Adapter* adapter_ptr = context_ptr->adapter_ptr(); - - vkapi::ImageSampler::Properties sampler_props{ - VK_FILTER_NEAREST, - VK_SAMPLER_MIPMAP_MODE_NEAREST, - VK_SAMPLER_ADDRESS_MODE_REPEAT, - VK_BORDER_COLOR_FLOAT_TRANSPARENT_BLACK, - }; - - VkImageType image_type = VK_IMAGE_TYPE_3D; - VkImageViewType image_view_type; - - switch (storage_type) { - case utils::kTexture3D: - image_type = VK_IMAGE_TYPE_3D; - image_view_type = VK_IMAGE_VIEW_TYPE_3D; - break; - case utils::kTexture2D: - image_type = VK_IMAGE_TYPE_2D; - image_view_type = VK_IMAGE_VIEW_TYPE_2D; - break; - default: - // Return an empty VulkanImage by default - return vkapi::VulkanImage(); - } - - VkSampler sampler = adapter_ptr->sampler_cache().retrieve(sampler_props); - - return adapter_ptr->vma().create_image( - vkapi::create_extent3d(image_extents), - image_format, - image_type, - image_view_type, - sampler_props, - sampler, - /*allow_transfer = */ true, - /*allocate_memory = */ allocate_memory); -} - -vkapi::VulkanBuffer allocate_buffer( - Context* const context_ptr, - const int64_t numel, - const utils::StorageType storage_type, - const vkapi::ScalarType dtype, - const bool allocate_memory) { - vkapi::Adapter* adapter_ptr = context_ptr->adapter_ptr(); - - switch (storage_type) { - case utils::kBuffer: - break; - default: - // Return an empty VulkanBuffer if Buffer storage is not used - return vkapi::VulkanBuffer(); - } - - return adapter_ptr->vma().create_storage_buffer( - element_size(dtype) * numel, /*gpu_only = */ true, 
allocate_memory); + sizes_ = new_sizes; + dim_order_ = new_dim_order; + update_metadata(); } -vTensorStorage::vTensorStorage( - Context* const context, - const utils::StorageType storage_type, - const utils::GPUMemoryLayout gpu_memory_layout, - const std::vector& padded_sizes, - const vkapi::ScalarType dtype, - const bool allocate_memory) - : context_(context), - storage_type_{storage_type}, - image_extents_(calculate_image_extents(padded_sizes, gpu_memory_layout)), - buffer_length_{utils::multiply_integers(padded_sizes)}, - buffer_offset_{0}, - image_(allocate_image( - context_, - image_extents_, - storage_type_, - to_vkformat(dtype), - allocate_memory)), - buffer_(allocate_buffer( - context_, - buffer_length_, - storage_type_, - dtype, - allocate_memory)), - last_access_{} {} - -vTensorStorage::vTensorStorage( - const vTensorStorage& other, - const int64_t buffer_offset) - : context_(other.context_), - storage_type_{other.storage_type_}, - image_extents_(other.image_extents_), - buffer_length_{other.buffer_length_}, - buffer_offset_{buffer_offset}, - image_(), - buffer_(other.buffer_, buffer_offset), - last_access_{other.last_access_} { - if (other.storage_type_ != utils::kBuffer) { - VK_THROW("Tensors with texture storage cannot be copied!"); - } -} +void vTensor::virtual_resize(const std::vector& new_sizes) { + VK_CHECK_COND( + new_sizes.size() == dim_order_.size(), + "new sizes cannot modify the dimensionality of the tensor "); -vTensorStorage::~vTensorStorage() { - flush(); + check_sizes(new_sizes); + sizes_ = new_sizes; + update_metadata(); } -void vTensorStorage::flush() { - if (image_) { - context_->register_image_cleanup(image_); - } else if (buffer_) { - context_->register_buffer_cleanup(buffer_); +/* + * Transposing the dim order is a bit unintuitive. dim0 and dim1 have swapped + * their "identities", so we need to swap the values of dim0 and dim1 wherever + * they appear in the dim order vector. Compare this to just swapping the + * elements at dim0 and dim1 in the `sizes` vectors. 
+ */ +void transpose_dim_order_inplace( + std::vector& dim_order, + const int64_t dim0, + const int64_t dim1) { + for (int i = 0; i < dim_order.size(); ++i) { + if (dim_order[i] == dim0) { + dim_order[i] = dim1; + } else if (dim_order[i] == dim1) { + dim_order[i] = dim0; + } } - last_access_ = {}; } -void vTensorStorage::transition( - vkapi::PipelineBarrier& pipeline_barrier, - const vkapi::PipelineStageFlags cur_stage, - const vkapi::MemoryAccessFlags cur_access) { - // Get last stage access - vkapi::PipelineStageFlags prev_stage = last_access_.stage; - vkapi::MemoryAccessFlags prev_access = last_access_.access; +void vTensor::virtual_transpose(const int64_t dim0, const int64_t dim1) { + std::iter_swap(sizes_.begin() + dim0, sizes_.begin() + dim1); - const bool prev_written = (prev_access & vkapi::MemoryAccessType::WRITE) != 0; - - VkImageLayout cur_layout = VK_IMAGE_LAYOUT_UNDEFINED; - VkImageLayout new_layout = VK_IMAGE_LAYOUT_UNDEFINED; - bool layout_changed = false; - if (image_) { - cur_layout = image_.layout(); - new_layout = vkapi::vk_layout(cur_stage, cur_access); - - layout_changed = cur_layout != new_layout; + const int dim0_whcn = sizes_.size() - 1 - dim0; + const int dim1_whcn = sizes_.size() - 1 - dim1; + if (packed_dim_ == dim0_whcn) { + packed_dim_ = dim1_whcn; + } else if (packed_dim_ == dim1_whcn) { + packed_dim_ = dim0_whcn; } - if (prev_written || layout_changed) { - VkPipelineStageFlags src_stage = vkapi::vk_stage(prev_stage); - if (0u == src_stage) { - src_stage = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT; - } - VkPipelineStageFlags dst_stage = vkapi::vk_stage(cur_stage); - if (0u == dst_stage) { - dst_stage = VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT; - } - - pipeline_barrier.stage.src |= src_stage; - pipeline_barrier.stage.dst |= dst_stage; - - if (image_) { - pipeline_barrier.images.emplace_back( - vkapi::vk_access(prev_stage, prev_access), - vkapi::vk_access(cur_stage, cur_access), - cur_layout, - new_layout, - image_); - - image_.set_layout(new_layout); - } else if (buffer_) { - pipeline_barrier.buffers.emplace_back( - vkapi::vk_access(prev_stage, prev_access), - vkapi::vk_access(cur_stage, cur_access), - buffer_); + if (storage_type() == utils::kBuffer) { + transpose_dim_order_inplace(dim_order_, dim0, dim1); + } else { + // Cannot transpose batch dimension for texture storage + VK_CHECK_COND(dim0_whcn < 3 && dim1_whcn < 3); + std::iter_swap( + axis_map_.begin() + dim0_whcn, axis_map_.begin() + dim1_whcn); + // Update the "identity" of the concatted dimension + if (axis_map_.at(3) == dim0_whcn) { + axis_map_.at(3) = dim1_whcn; + } else if (axis_map_.at(3) == dim1_whcn) { + axis_map_.at(3) = dim0_whcn; } } - - last_access_.stage = cur_stage; - last_access_.access = cur_access; -} - -bool vTensorStorage::is_copy_of(const vTensorStorage& other) const { - if (storage_type_ != other.storage_type_) { - return false; - } - if (storage_type_ == utils::kBuffer) { - return buffer_.is_copy_of(other.buffer_); - } - return false; -} - -void vTensorStorage::discard_and_reallocate( - const std::vector& padded_sizes, - const utils::GPUMemoryLayout gpu_memory_layout, - const vkapi::ScalarType dtype) { - const bool image_owns_memory = image_.owns_memory(); - const bool buffer_owns_memory = buffer_.owns_memory(); - - flush(); - - image_extents_ = calculate_image_extents(padded_sizes, gpu_memory_layout); - image_ = allocate_image( - context_, - image_extents_, - storage_type_, - to_vkformat(dtype), - image_owns_memory); - - buffer_length_ = utils::multiply_integers(padded_sizes); - buffer_ = 
allocate_buffer( - context_, buffer_length_, storage_type_, dtype, buffer_owns_memory); + update_metadata(); } } // namespace api diff --git a/backends/vulkan/runtime/api/containers/Tensor.h b/backends/vulkan/runtime/api/containers/Tensor.h index 11747c262d8..bbc80b85831 100644 --- a/backends/vulkan/runtime/api/containers/Tensor.h +++ b/backends/vulkan/runtime/api/containers/Tensor.h @@ -19,15 +19,6 @@ namespace vkcompute { namespace api { -/* - * Given the strides of a tensor in NCHW dimension order, calculate the dim - * order of the tensor by computing an index sort of the strides. Note that - * there is some ambiguity when multiple dimensions have the same stride; - * stable_sort is used to preserve the ordering of "outer" dimensions with - * respect to "inner" dimensions. - */ -std::vector strides_to_dim_order(const std::vector& strides); - /* * Given a GPUMemoryLayout value, produce a dim order vector that matches the * given memory layout. The produced dim order vector will be in the NCHW @@ -35,7 +26,7 @@ std::vector strides_to_dim_order(const std::vector& strides); */ std::vector calculate_dim_order( const size_t ndim, - const utils::GPUMemoryLayout memory_layout); + const int32_t packed_dim); /* * Given the sizes of a tensor and the dim order of the tensor (both in NCHW) @@ -66,15 +57,15 @@ std::vector unsqueeze_strides( */ std::vector calculate_padded_sizes( const std::vector& sizes, - const utils::GPUMemoryLayout memory_layout); + const int32_t packed_dim); /* - * Given the padded sizes of a tensor and the GPU memory layout, calculate the - * 3D image extents required to store the tensor data as an image texture. + * Calculate the image extents required of a texture backed tensor. */ utils::uvec3 calculate_image_extents( const std::vector& padded_sizes, - const utils::GPUMemoryLayout memory_layout); + const std::vector& axis_map, + const int32_t packed_dim); struct LastAccess { vkapi::PipelineStageFlags stage; @@ -98,8 +89,9 @@ class vTensorStorage final { vTensorStorage( Context* context, const utils::StorageType storage_type, - const utils::GPUMemoryLayout gpu_memory_layout, - const std::vector& sizes, + const std::vector& axis_map, + const int32_t packed_dim, + const std::vector& padded_sizes, const vkapi::ScalarType dtype, const bool allocate_memory = true); @@ -165,11 +157,6 @@ class vTensorStorage final { * Used for checking if this vTensorStorage is a copy of another instance */ bool is_copy_of(const vTensorStorage& other) const; - - void discard_and_reallocate( - const std::vector& padded_sizes, - const utils::GPUMemoryLayout gpu_memory_layout, - const vkapi::ScalarType dtype); }; class vTensor final { @@ -227,18 +214,59 @@ class vTensor final { vTensor& operator=(vTensor&& other) = default; private: - vkapi::ScalarType dtype_; - utils::GPUMemoryLayout memory_layout_; + /* + * "Core" tensor metadata. They are the minimum amount of information required + * to construct a tensor. + */ + // Whether the tensor has elements of type float, int, etc. + vkapi::ScalarType dtype_; // sizes of the tensor in NCHW dimension order std::vector sizes_; - // dim order of the tensor in NCHW dimension order + // Describes which dimension is "tightly packed" using WHCN index (i.e. 0 for + // width, 1 for height, etc.). For texture backed tensors, this describes + // which dimension is packed along a texel. For buffer backed tensors, this + // describes which dimension has a stride of 1 (i.e. is last in the dim + // order). + int32_t packed_dim_; + + /* + * "Layout" metadata. 
These describe with further detail how tensor data is + * laid out in memory. However, they are considered secondary to the "core" + * metadata members above because defaults can be assumed based on a given + * memory layout. When permuting the tensor without performing a copy, these + * metadata members are the ones that will be changed. All other metadata is + * derived from a combination of sizes, memory layout, and the below members. + */ + + // dim order of the tensor; dimension indices are in NCHW dimension order + // i.e. 0 is N, 1 is C, 2 is H, 3 is W for a 4D tensor. The dims with larger + // strides precede the dims with smaller strides in the dim order. The last + // dim is always the fastest moving dim with a stride of 1. std::vector dim_order_; + // Describes which axis of an image texture each dimension of the tensor maps + // to. The axis mapping allows texture based tensors to be permuted and + // transposed without modifying the underlying texture storage. For a more in + // depth explanation of axis mapping, see the `default_axis_map()` + // function. + std::vector axis_map_; + + /* + * The below can be consider "layout" metadata as well, but are derived from + * the above data members. + */ + // strides of the tensor in NCHW dimension order std::vector strides_; // Contains the number of elements in the tensor according to the canonical // sizes. size_t numel_; + + /* + * The below metadata members are derived from the above, and are typically + * to i.e. pass tensor metadata to compute shaders. + */ + // padded sizes of the tensor in NCHW dimension order. See the // calculate_padded_sizes() function for more context. Note that padded sizes // are only used for texture storage, and not for buffer storage. @@ -249,10 +277,8 @@ class vTensor final { // Contains the number of elements in the tensor according to the padded // sizes. size_t padded_numel_; - // Contains the "virtual" texture extents of the tensor. See the - // texture_limits() function for more context. Note that the texture limits - // are only relevant for texture storage, and not for buffer storage. - TextureLimits texture_limits_; + // See the comments documenting logical_limits() for more context. + TextureLimits logical_limits_; /* * Utility GPU buffers that can be passed to shaders in order to convey tensor @@ -266,7 +292,8 @@ class vTensor final { ParamsBuffer sizes_uniform_; ParamsBuffer strides_uniform_; ParamsBuffer numel_uniform_; - ParamsBuffer texture_limits_uniform_; + ParamsBuffer axis_map_uniform_; + ParamsBuffer logical_limits_uniform_; vTensorStorage storage_; @@ -313,8 +340,29 @@ class vTensor final { return storage_.storage_type_ == utils::kBuffer; } - inline const utils::uvec3& image_extents() const { - return storage_.image_extents_; + private: + void set_logical_limits(const utils::uvec3& image_extents); + + public: + /* + * The logical limits of the tensor are derived from the image extents of the + * image texture used to store the tensor, but with two key differences. + * + * First, the image extents are permuted according to the axis map. This + * makes it so that the first element of the logical limit is the limit of the + * texture axis corresponding to the width dimension of the tensor, the next + * element is the limit of the texture axis corresponding to the height + * dimension and the last element is the limit of the texture axis that + * corresponds to the channels dimension of the tensor. 
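To make the axis-map permutation concrete, here is a small self-contained C++ sketch (illustrative only; the array types and the default-style axis map value {0, 1, 2, 2} are assumptions, not the runtime's exact representation) showing how physical image extents are reordered into WHC-ordered logical limits, and how a width/height transpose changes only the axis map rather than the underlying texture.

#include <array>
#include <cstdint>
#include <iostream>

// Permute the physical image extents through an axis map so that
// logical_limits[0] is the extent of the texture axis backing the tensor's
// width dim, [1] the height dim, and [2] the channels dim.
std::array<int32_t, 3> to_logical_limits(
    const std::array<uint32_t, 3>& image_extents,
    const std::array<int32_t, 4>& axis_map) {
  return {
      static_cast<int32_t>(image_extents[axis_map[0]]),
      static_cast<int32_t>(image_extents[axis_map[1]]),
      static_cast<int32_t>(image_extents[axis_map[2]])};
}

int main() {
  // Assumed default axis map {0, 1, 2, 2}: W -> texture x, H -> texture y,
  // C -> texture z; the last entry names the dim batches are folded into.
  std::array<uint32_t, 3> extents = {8, 4, 2};
  auto limits = to_logical_limits(extents, {0, 1, 2, 2});
  std::cout << limits[0] << " " << limits[1] << " " << limits[2] << "\n"; // 8 4 2

  // After a virtual transpose of the width and height dims the axis map's
  // first two entries swap; the same extents are now read in swapped order.
  limits = to_logical_limits(extents, {1, 0, 2, 2});
  std::cout << limits[0] << " " << limits[1] << " " << limits[2] << "\n"; // 4 8 2
  return 0;
}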
+ * + * Second, the logical limits may use smaller extents than the actual image + * extents of the image texture. This is due to dynamic shape; if the tensor's + * `virtual_resize()` function is called, then the logical limits will reflect + * the extents that would be needed to support a tensor with the updated sizes + * instead of the original sizes. + */ + inline const utils::ivec3& logical_limits() const { + return logical_limits_.limits; } /* @@ -324,12 +372,18 @@ class vTensor final { return dtype_; } - inline utils::GPUMemoryLayout gpu_memory_layout() const { - return memory_layout_; - } + /* + * Provide a "best guess" of a memory layout that can be used to construct a + * tensor with similar layout metadata (i.e. strides, axis_map, etc.) as this + * tensor. In some scenarios, the exact layout of the tensor may not be able + * to be replicated due to calling `virtual_*()` functions after construction; + * however, this function will provide a memory layout that will produce the + * same `packed_dim_` as this tensor. + */ + utils::GPUMemoryLayout estimate_memory_layout() const; - inline int32_t packed_dim_whcn_idx() const { - return static_cast(memory_layout_); + inline int32_t packed_dim() const { + return packed_dim_; } inline const std::vector& sizes() const { @@ -348,6 +402,10 @@ class vTensor final { return dim_order_; } + inline const std::vector& axis_map() const { + return axis_map_; + } + inline const std::vector& strides() const { return strides_; } @@ -372,25 +430,22 @@ class vTensor final { const vkapi::BufferBindInfo strides_ubo(); /* - * Returns a GPU buffer containing the virtual image extents of the tensor. - * Since a tensor can be resized with the virtual_resize() function, this - * GPU buffer contains the image extents of the tensor calculated using the - * virtual_resize() function. This allows shaders to exit early if they are - * working outside the limits of the texture. - * - * This buffer should only be used to + * Returns a GPU buffer containing the texture axis mapping for each dimension + * of the tensor, in WHCN dimension order. + */ + const vkapi::BufferBindInfo axis_map_ubo(); + + /* + * Returns a GPU buffer containing the logical limits of the tensor. See the + * comments for logical_limits() for more context. */ - const vkapi::BufferBindInfo texture_limits_ubo(); + const vkapi::BufferBindInfo logical_limits_ubo(); /* * Returns the number of elements in the buffer used to store the tensor. */ const vkapi::BufferBindInfo numel_ubo(); - inline const utils::ivec3 texture_limits() const { - return texture_limits_.limits; - } - inline size_t numel() const { return numel_; } @@ -429,26 +484,10 @@ class vTensor final { private: /* - * Update the sizes, dim order, and strides metadata of the vTensor. - * - * The dim order is used as the "source of truth" for the strides and the - * strides are calculated from the dim order, therefore only the dim order is - * accepted as an argument to this function. Within the function, the new - * strides are computed from the new sizes and new dim order. - * - * Should not be used directly, reallocate() or virtual_resize() should be - * used instead. - */ - void update_metadata( - const std::vector& new_sizes, - const std::vector& new_dim_order); - - /* - * Convenience overload of update_metadata. Given the new sizes, the new - * strides will be re-calculated based on the current memory layout of the - * tensor. Update_metadata will be called with the new sizes and strides. 
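The surrounding comments treat the dim order as the source of truth for strides. The following self-contained sketch (illustrative, not the runtime's implementation) shows how NCHW strides fall out of the sizes plus a dim order listed from largest stride to smallest, with the last dim in the order always getting stride 1.

#include <cstdint>
#include <iostream>
#include <vector>

// The stride of each dim is the product of the sizes of all dims that come
// after it in the dim order; the last dim in the order gets stride 1.
std::vector<int64_t> strides_from_dim_order(
    const std::vector<int64_t>& sizes,
    const std::vector<int64_t>& dim_order) {
  std::vector<int64_t> strides(sizes.size());
  int64_t running = 1;
  for (auto it = dim_order.rbegin(); it != dim_order.rend(); ++it) {
    strides[*it] = running;
    running *= sizes[*it];
  }
  return strides;
}

int main() {
  std::vector<int64_t> sizes = {2, 3, 4, 5}; // N, C, H, W
  // Contiguous layout {0, 1, 2, 3} -> strides {60, 20, 5, 1}
  for (int64_t s : strides_from_dim_order(sizes, {0, 1, 2, 3})) {
    std::cout << s << " ";
  }
  std::cout << "\n";
  // Channels-last style layout {0, 2, 3, 1} -> strides {60, 1, 15, 3}
  for (int64_t s : strides_from_dim_order(sizes, {0, 2, 3, 1})) {
    std::cout << s << " ";
  }
  std::cout << "\n";
  return 0;
}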
+ * Assuming sizes, dim order, or axis mapping was modified, recompute all + * derived metadata and update metadata UBO with new values. */ - void update_size_metadata(const std::vector& new_sizes); + void update_metadata(); /* * Check that tensor sizes are valid given the current storage resource's @@ -458,13 +497,15 @@ class vTensor final { public: /* - * Virtually resize and "re-stride" the tensor by modifying the size and - * stride metadata that gets used in compute shaders. This allows the shader - * to interpret the underlying resource with the updated metadata. + * Change how the tensor should be interpreted by compute shaders via updating + * the size and dim order of the tensor. The new sizes and dim order may have + * different dimensionality than the current dimensionality of the tensor. + * + * This function can only be used for buffer-backed tensors, since texture + * backed buffers cannot change dimensionality or memory layout. * - * Note that the dim order is used as the source of truth for the strides; the - * strides are computed using the new sizes and new dim order, thus only the - * dim order is accepted as an argument to this function. + * TODO(ssjia): delete this API. prefer functions such as virtual_transpose + * instead. */ void virtual_reconfigure( const std::vector& new_sizes, @@ -473,17 +514,15 @@ class vTensor final { /* * Perform a virtual resize of the vTensor by modifying the size metadata that * gets used in compute shaders. This allows the shader to treat the - * underlying resource as if it were a different size. This function is a - * convenience overload of virtual_reconfigure; new strides will be computed - * based on the new sizes that preserves the memory layout of the tensor. + * underlying resource as if it were a different size. The new sizes cannot + * modify the dimensionality of the tensor. */ void virtual_resize(const std::vector& new_sizes); /* - * Discard the underlying VkImage or VkBuffer and re-allocate based on new - * tensor sizes + * Transpose the tensor in-place by updating its metadata. */ - void reallocate(const std::vector& new_sizes); + void virtual_transpose(const int64_t dim0, const int64_t dim1); /* * Check if this vTensor instance is a view of another vTensor instance diff --git a/backends/vulkan/runtime/gen_vulkan_spv.py b/backends/vulkan/runtime/gen_vulkan_spv.py index f4ba98b31fd..6ee29d45f18 100644 --- a/backends/vulkan/runtime/gen_vulkan_spv.py +++ b/backends/vulkan/runtime/gen_vulkan_spv.py @@ -38,6 +38,10 @@ # Basic configuration settings for shaders DEFAULT_ENV: Dict[str, Any] = { "PRECISION": "highp", + # B is shorthand for "binding". This is used to automatically increment the + # layout binding index when declaring layout bindings. Note that a container + # type is used because integers are immutable in Python. 
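The one-element list described above exists because the binding counter must stay mutable across calls to the template helpers. A rough C++ analogue of the same auto-increment pattern (purely illustrative; not the generator's code) passes the counter by reference so each declaration consumes the current binding index and bumps it.

#include <iostream>
#include <string>

// Each declaration uses the current binding index and increments the shared
// counter, so shader templates never hard-code binding numbers.
std::string declare_ubo(int& binding, const std::string& type, const std::string& name) {
  std::string decl = "layout(set = 0, binding = " + std::to_string(binding) +
      ") uniform restrict readonly " + name + "UBO { " + type + " " + name + "; };";
  ++binding;
  return decl;
}

int main() {
  int B = 0; // plays the role of DEFAULT_ENV["B"] = [0]
  std::cout << declare_ubo(B, "ivec4", "out_sizes") << "\n";    // binding = 0
  std::cout << declare_ubo(B, "ivec3", "out_limits") << "\n";   // binding = 1
  std::cout << declare_ubo(B, "ivec4", "out_axis_map") << "\n"; // binding = 2
  return 0;
}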
+ "B": [0], } # Establishes relationships between different tensor types and different GLSL types @@ -179,8 +183,14 @@ def get_access_qualifier(access_type: Optional[str]) -> str: raise AssertionError(f"Invalid access type: {access_type}") +def get_slot_val(slot: Union[int, List[int]]) -> int: + if isinstance(slot, list): + return slot[0] + return slot + + def layout_declare_buffer( - slot: int, + slot: Union[int, List[int]], access_type: str, var_name: str, dtype: str, @@ -192,15 +202,18 @@ def layout_declare_buffer( array_type = buffer_scalar_type(dtype) out_str = f""" -layout(set = 0, binding = {slot}) buffer {precision} restrict {get_access_qualifier(access_type)} {var_name}Buffer {{ +layout(set = 0, binding = {get_slot_val(slot)}) buffer {precision} restrict {get_access_qualifier(access_type)} {var_name}Buffer {{ {array_type} {var_name}[]; }}; """ + + if isinstance(slot, list): + slot[0] = slot[0] + 1 return out_str def layout_declare_image( - slot: int, + slot: Union[int, List[int]], access_type: str, var_name: str, dtype: str, @@ -209,11 +222,16 @@ def layout_declare_image( ) -> str: image_format = TYPE_MAPPINGS["IMAGE_FORMAT"][dtype] image_type = TYPE_MAPPINGS["IMAGE_T"][image_ndim][dtype] - return f"layout(set = 0, binding = {slot}, {image_format}) uniform {precision} restrict {get_access_qualifier(access_type)} {image_type} {var_name};" + + ret_str = f"layout(set = 0, binding = {get_slot_val(slot)}, {image_format}) uniform {precision} restrict {get_access_qualifier(access_type)} {image_type} {var_name};" + + if isinstance(slot, list): + slot[0] = slot[0] + 1 + return ret_str def layout_declare_sampler( - slot: int, + slot: Union[int, List[int]], access_type: str, var_name: str, dtype: str, @@ -222,11 +240,16 @@ def layout_declare_sampler( image_ndim: int = 3, ) -> str: sampler_type = TYPE_MAPPINGS["SAMPLER_T"][image_ndim][dtype] - return f"layout(set = 0, binding = {slot}) uniform {precision} {sampler_type} {var_name};" + + ret_str = f"layout(set = 0, binding = {get_slot_val(slot)}) uniform {precision} {sampler_type} {var_name};" + + if isinstance(slot, list): + slot[0] = slot[0] + 1 + return ret_str def layout_declare_tensor( - slot: int, + slot: Union[int, List[int]], access_type: str, var_name: str, dtype: str, @@ -262,7 +285,9 @@ def layout_declare_tensor( ) -def layout_declare_ubo(slot: int, *args, precision: str = "PRECISION") -> str: +def layout_declare_ubo( + slot: Union[int, List[int]], *args, precision: str = "PRECISION" +) -> str: assert len(args) % 2 == 0 var_list = list(zip(args[::2], args[1::2])) @@ -272,12 +297,14 @@ def layout_declare_ubo(slot: int, *args, precision: str = "PRECISION") -> str: ubo_name += var_name + "_" out_str = f""" -layout(set = 0, binding = {slot}) uniform {precision} restrict readonly {ubo_name}UBO {{ +layout(set = 0, binding = {get_slot_val(slot)}) uniform {precision} restrict readonly {ubo_name}UBO {{ """ for type_name, var_name in var_list: out_str += f"{type_name} {var_name};\n" out_str += "};" + if isinstance(slot, list): + slot[0] = slot[0] + 1 return out_str diff --git a/backends/vulkan/runtime/graph/ComputeGraph.cpp b/backends/vulkan/runtime/graph/ComputeGraph.cpp index e014c52a3a4..64f24e3012d 100644 --- a/backends/vulkan/runtime/graph/ComputeGraph.cpp +++ b/backends/vulkan/runtime/graph/ComputeGraph.cpp @@ -38,14 +38,81 @@ namespace vkcompute { VALUE_PTR_CLASS_IMPL(vTensorPtr, api::vTensor, Tensor) VALUE_PTR_CLASS_IMPL(TensorRefPtr, TensorRef, TensorRef) -VALUE_PTR_CLASS_IMPL(StagingPtr, api::StorageBuffer, Staging) 
+VALUE_PTR_CLASS_IMPL(StagingPtr, api::StagingBuffer, Staging) VALUE_PTR_CLASS_IMPL(IntListPtr, std::vector, IntList) VALUE_PTR_CLASS_IMPL(DoubleListPtr, std::vector, DoubleList) VALUE_PTR_CLASS_IMPL(BoolListPtr, std::vector, BoolList) VALUE_PTR_CLASS_IMPL(ValueListPtr, std::vector, ValueList) +VALUE_PTR_CLASS_IMPL(SymIntPtr, SymInt, SymInt) #undef VALUE_PTR_CLASS_IMPL +// +// TmpTensor +// + +TmpTensor::TmpTensor( + ComputeGraph* const graph_ptr, + const std::vector& sizes, + const vkapi::ScalarType dtype, + const utils::StorageType storage_type, + const utils::GPUMemoryLayout memory_layout) + : graph_p(graph_ptr), + sobj_idx(get_sobj_idx()), + vref(graph_p->add_tensor( + sizes, + dtype, + storage_type, + memory_layout, + sobj_idx)) {} + +TmpTensor::TmpTensor( + ComputeGraph* const graph_ptr, + const std::vector& sizes, + const vkapi::ScalarType dtype, + const utils::StorageType storage_type) + : graph_p(graph_ptr), + sobj_idx(get_sobj_idx()), + vref(graph_p->add_tensor(sizes, dtype, storage_type, sobj_idx)) {} + +TmpTensor::TmpTensor( + ComputeGraph* const graph_ptr, + const std::vector& sizes, + const vkapi::ScalarType dtype, + const utils::GPUMemoryLayout memory_layout) + : graph_p(graph_ptr), + sobj_idx(get_sobj_idx()), + vref(graph_p->add_tensor(sizes, dtype, memory_layout, sobj_idx)) {} + +TmpTensor::TmpTensor( + ComputeGraph* const graph_ptr, + const std::vector& sizes, + const vkapi::ScalarType dtype) + : graph_p(graph_ptr), + sobj_idx(get_sobj_idx()), + vref(graph_p->add_tensor(sizes, dtype, sobj_idx)) {} + +TmpTensor::~TmpTensor() { + // Lifetime of this temporary tensor is expired; return the shared object to + // the pool, as long as the sobj index is valid + if (sobj_idx >= 0) { + graph_p->tmp_shared_object_idxs_.emplace(sobj_idx); + } +} + +int64_t TmpTensor::get_sobj_idx() { + int64_t sobj_idx; + // If no available temporary shared objects, request a new one to be created + if (graph_p->tmp_shared_object_idxs_.empty()) { + sobj_idx = graph_p->shared_objects_.size(); + } else { + // Get the first available shared object idx + sobj_idx = graph_p->tmp_shared_object_idxs_.top(); + graph_p->tmp_shared_object_idxs_.pop(); + } + return sobj_idx; +} + // // ComputeGraph // @@ -146,7 +213,7 @@ std::vector ComputeGraph::dim_order_of(const ValueRef idx) const { if (val.isTensor()) { return val.toConstTensor().dim_order(); } - VK_THROW("Could not get strides of value with type ", val.type()); + VK_THROW("Could not get dim order of value with type ", val.type()); } std::vector ComputeGraph::strides_of(const ValueRef idx) const { @@ -262,7 +329,7 @@ ValueRef ComputeGraph::add_staging( const size_t numel) { ValueRef idx(static_cast(values_.size())); check_no_active_value_ptrs(); - values_.emplace_back(api::StorageBuffer(context(), dtype, numel)); + values_.emplace_back(api::StagingBuffer(context(), dtype, numel)); return idx; } @@ -287,6 +354,13 @@ ValueRef ComputeGraph::add_string(std::string&& str) { return idx; } +ValueRef ComputeGraph::add_symint(const int32_t val) { + ValueRef idx(static_cast(values_.size())); + check_no_active_value_ptrs(); + values_.emplace_back(SymInt(context(), val)); + return idx; +} + ValueRef ComputeGraph::set_input_tensor( const ValueRef idx, const bool use_staging) { @@ -326,6 +400,22 @@ ValueRef ComputeGraph::set_output_tensor( return idx; } +vkapi::BufferBindInfo ComputeGraph::get_or_create_int_param_buffer( + const ValueRef idx) { + if (values_.at(idx).isInt()) { + const int32_t val = extract_scalar(idx); + create_params_buffer(val); + } else if 
(values_.at(idx).isSymInt()) { + SymIntPtr symint = get_symint(idx); + return vkapi::BufferBindInfo(symint->gpu_buffer.buffer()); + } + VK_THROW("Cannot create a int param buffer for the given value"); +} + +void ComputeGraph::set_symint(const ValueRef idx, const int32_t val) { + get_symint(idx)->set(val); +} + SharedObject& ComputeGraph::get_shared_object(const int64_t idx) { if (idx >= shared_objects_.size()) { shared_objects_.resize(static_cast(idx + 1)); @@ -364,7 +454,7 @@ utils::uvec3 ComputeGraph::create_global_wg_size(const ValueRef idx) { if (is_buffer_storage(idx)) { return {uint32_t(numel_of(idx)), 1u, 1u}; } - return image_extents_of(idx); + return logical_limits_of(idx); } utils::uvec3 ComputeGraph::create_local_wg_size( @@ -403,7 +493,7 @@ void ComputeGraph::copy_into_staging( const size_t numel) { StagingPtr staging = get_staging(idx); size_t nbytes = numel * vkapi::element_size(staging->dtype()); - copy_ptr_to_staging(data, *staging, nbytes); + staging->copy_from(data, nbytes); } void ComputeGraph::copy_from_staging( @@ -412,7 +502,7 @@ void ComputeGraph::copy_from_staging( const size_t numel) { StagingPtr staging = get_staging(idx); size_t nbytes = numel * vkapi::element_size(staging->dtype()); - copy_staging_to_ptr(*staging, data, nbytes); + staging->copy_to(data, nbytes); } void ComputeGraph::prepare() { diff --git a/backends/vulkan/runtime/graph/ComputeGraph.h b/backends/vulkan/runtime/graph/ComputeGraph.h index b73b552067c..d61ff7e61f6 100644 --- a/backends/vulkan/runtime/graph/ComputeGraph.h +++ b/backends/vulkan/runtime/graph/ComputeGraph.h @@ -11,6 +11,7 @@ // @lint-ignore-every CLANGTIDY facebook-hte-BadMemberName #include +#include #include @@ -58,14 +59,88 @@ class ComputeGraph; DECL_VALUE_PTR_CLASS(vTensorPtr, api::vTensor) DECL_VALUE_PTR_CLASS(TensorRefPtr, TensorRef) -DECL_VALUE_PTR_CLASS(StagingPtr, api::StorageBuffer) +DECL_VALUE_PTR_CLASS(StagingPtr, api::StagingBuffer) DECL_VALUE_PTR_CLASS(IntListPtr, std::vector) DECL_VALUE_PTR_CLASS(DoubleListPtr, std::vector) DECL_VALUE_PTR_CLASS(BoolListPtr, std::vector) DECL_VALUE_PTR_CLASS(ValueListPtr, std::vector) +DECL_VALUE_PTR_CLASS(SymIntPtr, SymInt); #undef DECL_VALUE_PTR_CLASS +// +// TmpTensor +// + +/* + * This struct is used to recycle the memory of temporary tensors that are + * created during the execution of a node. Upon construction, this struct will + * check the `tmp_shared_object_idxs_` of the provided `ComputeGraph` instance + * if any shared objects are available; if not, then a new one is created. A + * tensor value is then added to the `ComputeGraph` instance with the requested + * specifications. Upon destruction, the shared object index of the temporary + * tensor is returned to `tmp_shared_object_idxs_`. + * + * Note that instances of this struct can be used as if they were `ValueRef` due + * to implementation of a custom casting operator. + * + * This class should only be used to create tensors whose lifetimes exist only + * in a well defined scope (i.e. within a function). 
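A self-contained C++ model of the recycling behaviour described in this comment may help (no Vulkan objects; the names are invented): a scoped handle pulls a shared-object index from a pool on construction and pushes it back on destruction, so the next temporary in a later scope reuses the same backing slot.

#include <cstdint>
#include <iostream>
#include <stack>

// Hands out shared-object indices, preferring recycled ones.
struct SharedObjectPool {
  std::stack<int64_t> free_idxs;
  int64_t next_idx = 0;

  int64_t acquire() {
    if (free_idxs.empty()) {
      return next_idx++; // no recycled slot available; create a new one
    }
    int64_t idx = free_idxs.top();
    free_idxs.pop();
    return idx;
  }
};

// Scoped handle analogous to a temporary tensor bound to a shared object.
struct ScopedTmp {
  SharedObjectPool& pool;
  int64_t sobj_idx;

  explicit ScopedTmp(SharedObjectPool& p) : pool(p), sobj_idx(pool.acquire()) {}
  ~ScopedTmp() {
    pool.free_idxs.push(sobj_idx); // lifetime over; recycle the backing slot
  }
};

int main() {
  SharedObjectPool pool;
  {
    ScopedTmp a(pool); // gets index 0
    ScopedTmp b(pool); // gets index 1
    std::cout << a.sobj_idx << " " << b.sobj_idx << "\n";
  } // both indices are returned to the pool here
  ScopedTmp c(pool); // reuses a recycled index instead of allocating index 2
  std::cout << c.sobj_idx << "\n";
  return 0;
}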
+ */ +struct TmpTensor { + ComputeGraph* graph_p; + int64_t sobj_idx; + ValueRef vref; + + // + // Match all available overloads of `add_tensor` + // + + TmpTensor( + ComputeGraph* const graph_ptr, + const std::vector& sizes, + const vkapi::ScalarType dtype, + const utils::StorageType storage_type, + const utils::GPUMemoryLayout memory_layout); + + TmpTensor( + ComputeGraph* const graph_ptr, + const std::vector& sizes, + const vkapi::ScalarType dtype, + const utils::StorageType storage_type); + + TmpTensor( + ComputeGraph* const graph_ptr, + const std::vector& sizes, + const vkapi::ScalarType dtype, + const utils::GPUMemoryLayout memory_layout); + + TmpTensor( + ComputeGraph* const graph_ptr, + const std::vector& sizes, + const vkapi::ScalarType dtype); + + // No copy construction or assignment + TmpTensor(TmpTensor& other) = delete; + TmpTensor& operator=(TmpTensor& other) = delete; + + // No move construction or assignment + TmpTensor(TmpTensor&& other) = delete; + TmpTensor& operator=(TmpTensor&& other) = delete; + + // Custom cast to ValueRef + operator ValueRef() const { + return vref; + }; + + ~TmpTensor(); + + private: + // Helper function to get first available shared object index or request a new + // one to be created. + int64_t get_sobj_idx(); +}; + // // ComputeGraph // @@ -93,7 +168,12 @@ class ComputeGraph final { vkapi::DescriptorPoolConfig execute_descriptor_counts_; std::unique_ptr context_; + std::vector shared_objects_; + // This stack is used by `TmpTensor` instances to recycle shared objects + // for temporary tensors. See the comments of `TmpTensor` for more details + std::stack tmp_shared_object_idxs_; + std::vector values_; std::vector param_ubos_; @@ -154,6 +234,7 @@ class ComputeGraph final { GET_AND_CHECK_VAL_AS_PTR_TYPE_FNS(DoubleListPtr, double_list, DoubleList) GET_AND_CHECK_VAL_AS_PTR_TYPE_FNS(BoolListPtr, bool_list, BoolList) GET_AND_CHECK_VAL_AS_PTR_TYPE_FNS(ValueListPtr, value_list, ValueList) + GET_AND_CHECK_VAL_AS_PTR_TYPE_FNS(SymIntPtr, symint, SymInt); #undef GET_AND_CHECK_VAL_AS_PTR_TYPE_FNS @@ -209,8 +290,8 @@ class ComputeGraph final { vkapi::ScalarType dtype_of(const ValueRef idx) const; - inline utils::uvec3 image_extents_of(const ValueRef idx) const { - return values_.at(idx).toConstTensor().image_extents(); + inline const utils::ivec3& logical_limits_of(const ValueRef idx) const { + return values_.at(idx).toConstTensor().logical_limits(); } inline int32_t numel_of(const ValueRef idx) const { @@ -232,12 +313,13 @@ class ComputeGraph final { .is_view_of(values_.at(base).toConstTensor()); } - inline utils::GPUMemoryLayout memory_layout_of(const ValueRef idx) const { - return values_.at(idx).toConstTensor().gpu_memory_layout(); + inline utils::GPUMemoryLayout estimate_memory_layout_of( + const ValueRef idx) const { + return values_.at(idx).toConstTensor().estimate_memory_layout(); } - inline int32_t packed_dim_whcn_idx_of(const ValueRef idx) const { - return values_.at(idx).toConstTensor().packed_dim_whcn_idx(); + inline int32_t packed_dim_of(const ValueRef idx) const { + return values_.at(idx).toConstTensor().packed_dim(); } inline vkapi::BufferBindInfo sizes_ubo(const ValueRef idx) { @@ -252,8 +334,12 @@ class ComputeGraph final { return values_.at(idx).toTensor().numel_ubo(); } - inline vkapi::BufferBindInfo texture_limits_ubo(const ValueRef idx) { - return values_.at(idx).toTensor().texture_limits_ubo(); + inline vkapi::BufferBindInfo axis_map_ubo(const ValueRef idx) { + return values_.at(idx).toTensor().axis_map_ubo(); + } + + inline 
vkapi::BufferBindInfo logical_limits_ubo(const ValueRef idx) { + return values_.at(idx).toTensor().logical_limits_ubo(); } // @@ -428,15 +514,28 @@ class ComputeGraph final { ValueRef add_string(std::string&& str); + ValueRef add_symint(const int32_t val); + ValueRef set_input_tensor(const ValueRef idx, const bool use_staging = true); ValueRef set_output_tensor(const ValueRef idx, const bool use_staging = true); template - const vkapi::BufferBindInfo create_params_buffer(const Block& data) { + vkapi::BufferBindInfo create_params_buffer(const Block& data) { param_ubos_.emplace_back(api::ParamsBuffer(context_.get(), data)); return vkapi::BufferBindInfo(param_ubos_.back().buffer()); } + /* + * Given a ValueRef, do the following depending on the type of the Value: + * - If it is a SymInt, return the BufferBindInfo of the ParamsBuffer object + * backing the SymInt. + * - If it is a regular Int, create a new ParamsBuffer using the integer value + * and return the BufferBindInfo of the created ParamsBuffer. + */ + vkapi::BufferBindInfo get_or_create_int_param_buffer(const ValueRef idx); + + void set_symint(const ValueRef idx, const int32_t val); + /* * Convenience function to add an input tensor along with its staging buffer */ @@ -583,6 +682,9 @@ class ComputeGraph final { friend class DoubleListPtr; friend class BoolListPtr; friend class ValueListPtr; + friend class SymIntPtr; + + friend struct TmpTensor; }; template diff --git a/backends/vulkan/runtime/graph/Logging.cpp b/backends/vulkan/runtime/graph/Logging.cpp index 2e4833bfc64..e05fa4e4876 100644 --- a/backends/vulkan/runtime/graph/Logging.cpp +++ b/backends/vulkan/runtime/graph/Logging.cpp @@ -71,8 +71,8 @@ void ComputeGraph::print_readable() { << std::setfill(' ') << std::endl; std::cout << std::setw(6) << "idx" << std::setw(10) << "type" << std::setw(20) - << "sizes" << std::setw(10) << "node_type" << std::setw(10) - << "so_idx" << std::endl; + << "sizes" << std::setw(10) << "node_type" << std::setw(15) + << "storage_bytes" << std::setw(10) << "so_idx" << std::endl; size_t value_idx = 0; for (Value& val : values_) { @@ -108,6 +108,16 @@ void ComputeGraph::print_readable() { } } + // Actual storage bytes used + std::cout << std::setw(15); + if (val.isTensor()) { + const api::vTensor& v_tensor = val.toTensor(); + auto memory_reqs = v_tensor.get_memory_requirements(); + std::cout << memory_reqs.size; + } else { + std::cout << ""; + } + std::cout << std::setw(10); if (value_ref_to_shared_object_idx.count(value_idx) > 0) { size_t shared_obj_idx = value_ref_to_shared_object_idx.at(value_idx); diff --git a/backends/vulkan/runtime/graph/containers/SharedObject.cpp b/backends/vulkan/runtime/graph/containers/SharedObject.cpp index 0d8b77a5b74..f2474da6673 100644 --- a/backends/vulkan/runtime/graph/containers/SharedObject.cpp +++ b/backends/vulkan/runtime/graph/containers/SharedObject.cpp @@ -15,10 +15,7 @@ namespace vkcompute { void SharedObject::add_user(ComputeGraph* const graph, const ValueRef idx) { vTensorPtr t = graph->get_tensor(idx); - // // Aggregate Memory Requirements - // - const VkMemoryRequirements mem_reqs = t->get_memory_requirements(); aggregate_memory_requirements.size = std::max(mem_reqs.size, aggregate_memory_requirements.size); @@ -26,27 +23,6 @@ void SharedObject::add_user(ComputeGraph* const graph, const ValueRef idx) { std::max(mem_reqs.alignment, aggregate_memory_requirements.alignment); aggregate_memory_requirements.memoryTypeBits |= mem_reqs.memoryTypeBits; - // - // Aggregate Allocation Create Info - // - - const 
VmaAllocationCreateInfo create_info = t->get_allocation_create_info(); - // Clear out CREATE_STRATEGY bit flags in case of conflict - VmaAllocationCreateFlags clear_mask = ~VMA_ALLOCATION_CREATE_STRATEGY_MASK; - VmaAllocationCreateFlags create_flags = create_info.flags & clear_mask; - // Use the default allocation strategy - aggregate_create_info.flags = - create_flags | vkapi::DEFAULT_ALLOCATION_STRATEGY; - - // Set the usage flag if it is currently not set - if (aggregate_create_info.usage == VMA_MEMORY_USAGE_UNKNOWN) { - aggregate_create_info.usage = create_info.usage; - } - // Otherwise check that there is no conflict regarding usage - VK_CHECK_COND(aggregate_create_info.usage == create_info.usage); - aggregate_create_info.requiredFlags |= create_info.requiredFlags; - aggregate_create_info.preferredFlags |= create_info.preferredFlags; - users.emplace_back(idx); } @@ -54,8 +30,12 @@ void SharedObject::allocate(ComputeGraph* const graph) { if (aggregate_memory_requirements.size == 0) { return; } + + VmaAllocationCreateInfo alloc_create_info = + graph->context()->adapter_ptr()->vma().gpuonly_resource_create_info(); + allocation = graph->context()->adapter_ptr()->vma().create_allocation( - aggregate_memory_requirements, aggregate_create_info); + aggregate_memory_requirements, alloc_create_info); } void SharedObject::bind_users(ComputeGraph* const graph) { diff --git a/backends/vulkan/runtime/graph/containers/SharedObject.h b/backends/vulkan/runtime/graph/containers/SharedObject.h index 37e80257f46..bd77f6f39ba 100644 --- a/backends/vulkan/runtime/graph/containers/SharedObject.h +++ b/backends/vulkan/runtime/graph/containers/SharedObject.h @@ -28,7 +28,6 @@ struct SharedObject { explicit SharedObject() = default; VkMemoryRequirements aggregate_memory_requirements; - VmaAllocationCreateInfo aggregate_create_info; std::vector users; vkapi::Allocation allocation; diff --git a/backends/vulkan/runtime/graph/containers/SymInt.cpp b/backends/vulkan/runtime/graph/containers/SymInt.cpp new file mode 100644 index 00000000000..c91db84b787 --- /dev/null +++ b/backends/vulkan/runtime/graph/containers/SymInt.cpp @@ -0,0 +1,24 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include + +namespace vkcompute { + +SymInt::SymInt(api::Context* context_p, const int32_t val) + : gpu_buffer(context_p, val){}; + +void SymInt::set(const int32_t val) { + gpu_buffer.update(val); +} + +void SymInt::operator=(const int32_t val) { + gpu_buffer.update(val); +} + +} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/containers/SymInt.h b/backends/vulkan/runtime/graph/containers/SymInt.h new file mode 100644 index 00000000000..0c9fbee5fe2 --- /dev/null +++ b/backends/vulkan/runtime/graph/containers/SymInt.h @@ -0,0 +1,41 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include +#include + +namespace vkcompute { + +/* + * Represents a symbolic integer whose value can be variable. It is implemented + * as a thin wrapper around a `ParamsBuffer` object that holds the value of the + * integer. 
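The practical difference between a symbolic integer and a plain integer Value can be modelled on the CPU alone. In the sketch below (an analogy only, not the runtime API), a value captured by copy at "record" time stays frozen, while a value read through shared storage reflects later updates when the recorded work runs.

#include <functional>
#include <iostream>
#include <memory>

int main() {
  int plain_int = 3;
  auto shared_int = std::make_shared<int>(3); // stands in for the GPU-visible buffer

  // "Record" two commands, analogous to encoding compute shaders.
  std::function<void()> uses_plain = [plain_int]() {
    std::cout << "plain int sees " << plain_int << "\n";
  };
  std::function<void()> uses_shared = [shared_int]() {
    std::cout << "shared buffer sees " << *shared_int << "\n";
  };

  // Update both after recording, analogous to changing the value before execution.
  plain_int = 7;
  *shared_int = 7;

  uses_plain();  // still prints 3: the copy was baked in at record time
  uses_shared(); // prints 7: the update is visible through the shared storage
  return 0;
}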
The `ParamsBuffer` object allows the value of the symbolic integer + * to be changed from the CPU and have those changes be visible to all shaders + * that use the symbolic integer; it also allows the value of the symbolic + * integer to be the result of a compute shader. + * + * Regular scalar types represented by `TypeTag::INT` cannot be used for + * symbolic integers because their value is assumed to be constant; therefore + * the `Value` instance holding the value of the scalar does not contain + * any reference to the GPU buffers used to pass its value into compute shaders. + * Therefore, updating the value of the scalar does not impact the value seen + * by compute shaders. + */ +struct SymInt final { + api::ParamsBuffer gpu_buffer; + + explicit SymInt(api::Context* context_p, const int32_t val); + + void set(const int32_t val); + + void operator=(const int32_t val); +}; + +} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/containers/Types.cpp b/backends/vulkan/runtime/graph/containers/Types.cpp index c5ffc65add1..e7a8951a552 100644 --- a/backends/vulkan/runtime/graph/containers/Types.cpp +++ b/backends/vulkan/runtime/graph/containers/Types.cpp @@ -29,6 +29,7 @@ std::ostream& operator<<(std::ostream& out, const TypeTag& tag) { PRINT_CASE(BOOLLIST) PRINT_CASE(VALUELIST) PRINT_CASE(STRING) + PRINT_CASE(SYMINT) } return out; } diff --git a/backends/vulkan/runtime/graph/containers/Types.h b/backends/vulkan/runtime/graph/containers/Types.h index 79edbd50d3a..5840d1695ee 100644 --- a/backends/vulkan/runtime/graph/containers/Types.h +++ b/backends/vulkan/runtime/graph/containers/Types.h @@ -36,6 +36,7 @@ enum class TypeTag : uint32_t { // Special Type VALUELIST, STRING, + SYMINT, }; std::ostream& operator<<(std::ostream& out, const TypeTag& tag); diff --git a/backends/vulkan/runtime/graph/containers/Value.h b/backends/vulkan/runtime/graph/containers/Value.h index ba82213c6f8..8773f0c0b04 100644 --- a/backends/vulkan/runtime/graph/containers/Value.h +++ b/backends/vulkan/runtime/graph/containers/Value.h @@ -13,6 +13,7 @@ #include #include +#include #include namespace vkcompute { @@ -28,6 +29,11 @@ inline bool is_valid(ValueRef value_ref) { struct IOValueRef { ValueRef value; ValueRef staging; + + // Custom cast to ValueRef + operator ValueRef() const { + return value; + }; }; /* @@ -53,7 +59,7 @@ struct Value final { } u; api::vTensor as_tensor; - api::StorageBuffer as_staging; + api::StagingBuffer as_staging; TensorRef as_tensorref; std::vector as_int_list; @@ -67,6 +73,8 @@ struct Value final { std::string as_string; + SymInt as_symint; + Payload() : u() {} // NOLINTNEXTLINE ~Payload(){}; @@ -108,7 +116,7 @@ struct Value final { CASE_MOVE_MOVEABLE_TYPE( TypeTag::TENSOR, api::vTensor, as_tensor, vTensor); CASE_MOVE_MOVEABLE_TYPE( - TypeTag::STAGING, api::StorageBuffer, as_staging, StorageBuffer); + TypeTag::STAGING, api::StagingBuffer, as_staging, StagingBuffer); CASE_MOVE_MOVEABLE_TYPE( TypeTag::TENSORREF, TensorRef, as_tensorref, TensorRef); // Scalar lists @@ -123,6 +131,7 @@ struct Value final { TypeTag::VALUELIST, std::vector, as_value_list, vector); CASE_MOVE_MOVEABLE_TYPE( TypeTag::STRING, std::string, as_string, basic_string); + CASE_MOVE_MOVEABLE_TYPE(TypeTag::SYMINT, SymInt, as_symint, SymInt); case TypeTag::NONE: clearToNone(); @@ -152,7 +161,7 @@ struct Value final { payload.as_tensor.~vTensor(); break; case TypeTag::STAGING: - payload.as_staging.~StorageBuffer(); + payload.as_staging.~StagingBuffer(); break; case TypeTag::TENSORREF: 
payload.as_tensorref.~TensorRef(); @@ -172,6 +181,9 @@ struct Value final { case TypeTag::STRING: payload.as_string.~basic_string(); break; + case TypeTag::SYMINT: + payload.as_symint.~SymInt(); + break; // Manually list out the types so that if a type here is added later and // not handled the compiler can catch it. case TypeTag::NONE: @@ -247,7 +259,7 @@ struct Value final { as_tensor); SUPPORT_TRIVIALLY_MOVEABLE_TYPE( - api::StorageBuffer, + api::StagingBuffer, Staging, TypeTag::STAGING, as_staging); @@ -288,6 +300,8 @@ struct Value final { TypeTag::STRING, as_string); + SUPPORT_TRIVIALLY_MOVEABLE_TYPE(SymInt, SymInt, TypeTag::SYMINT, as_symint); + #undef SUPPORT_TRIVIALLY_COPYABLE_TYPE #undef SUPPORT_TRIVIALLY_MOVEABLE_TYPE diff --git a/backends/vulkan/runtime/graph/ops/PrepackNode.cpp b/backends/vulkan/runtime/graph/ops/PrepackNode.cpp index b77c62920dd..61b24cd409b 100644 --- a/backends/vulkan/runtime/graph/ops/PrepackNode.cpp +++ b/backends/vulkan/runtime/graph/ops/PrepackNode.cpp @@ -45,24 +45,23 @@ PrepackNode::PrepackNode( graph.update_descriptor_counts(noop_shader_, /*execute = */ false); } -api::StorageBuffer PrepackNode::create_staging_buffer(ComputeGraph* graph) { +api::StagingBuffer PrepackNode::create_staging_buffer(ComputeGraph* graph) { vTensorPtr packed = graph->get_tensor(packed_); // If no TensorRef is provided, create a staging buffer of zeros according to // the vkapi::vTensor metadata. if (graph->val_is_none(tref_)) { size_t numel = utils::multiply_integers(packed->sizes()); - api::StorageBuffer staging(graph->context(), packed->dtype(), numel); - size_t nbytes = numel * vkapi::element_size(packed->dtype()); - set_staging_zeros(staging, nbytes); + api::StagingBuffer staging(graph->context(), packed->dtype(), numel); + staging.set_staging_zeros(); return staging; } TensorRefPtr tref = graph->get_tref(tref_); size_t numel = utils::multiply_integers(tref->sizes); - api::StorageBuffer staging(graph->context(), tref->dtype, numel); + api::StagingBuffer staging(graph->context(), tref->dtype, numel); size_t nbytes = numel * vkapi::element_size(tref->dtype); - copy_ptr_to_staging(tref->data, staging, nbytes); + staging.copy_from(tref->data, nbytes); return staging; } @@ -70,7 +69,7 @@ void PrepackNode::encode(ComputeGraph* graph) { api::Context* const context = graph->context(); vTensorPtr packed = graph->get_tensor(packed_); - api::StorageBuffer staging = create_staging_buffer(graph); + api::StagingBuffer staging = create_staging_buffer(graph); std::unique_lock cmd_lock = context->dispatch_lock(); diff --git a/backends/vulkan/runtime/graph/ops/PrepackNode.h b/backends/vulkan/runtime/graph/ops/PrepackNode.h index c3ac8b963fd..3e713303c3d 100644 --- a/backends/vulkan/runtime/graph/ops/PrepackNode.h +++ b/backends/vulkan/runtime/graph/ops/PrepackNode.h @@ -56,7 +56,7 @@ class PrepackNode final { const vkapi::SpecVarList spec_vars_; private: - api::StorageBuffer create_staging_buffer(ComputeGraph* graph); + api::StagingBuffer create_staging_buffer(ComputeGraph* graph); }; } // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/glsl/activations.h b/backends/vulkan/runtime/graph/ops/glsl/activations.h index c5ee3b20855..94c9e1274de 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/activations.h +++ b/backends/vulkan/runtime/graph/ops/glsl/activations.h @@ -18,7 +18,7 @@ float hardswish(float x) { vec4 hardswish(vec4 tex) { return vec4( - hardswish(tex.x), hardswish(tex.y), hardswish(tex.z), hardswish(tex.z)); + hardswish(tex.x), hardswish(tex.y), 
hardswish(tex.z), hardswish(tex.w)); } float hardshrink(float x, float lambda, float neg_lambda) { @@ -30,3 +30,15 @@ vec4 hardshrink(vec4 tex, float lambda, float neg_lambda) { (vec4(greaterThan(tex, vec4(lambda))) + vec4(lessThan(tex, vec4(neg_lambda)))); } + +float hardsigmoid(float x) { + return mix(float(x >= 0.0), x / 6 + 0.5, float(abs(x) <= 3.0)); +} + +vec4 hardsigmoid(vec4 tex) { + return vec4( + hardsigmoid(tex.x), + hardsigmoid(tex.y), + hardsigmoid(tex.z), + hardsigmoid(tex.w)); +} diff --git a/backends/vulkan/runtime/graph/ops/glsl/addmm_naive.glsl b/backends/vulkan/runtime/graph/ops/glsl/addmm_naive.glsl deleted file mode 100644 index dbc87eb7944..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/addmm_naive.glsl +++ /dev/null @@ -1,79 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#version 450 core - -#define PRECISION ${PRECISION} - -$if MAT2_IS_TRANSPOSED: - #define MAT2_IS_TRANSPOSED - -#include "indexing_utils.h" -#include "matmul.h" - -layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly image3D im_out; -layout(set = 0, binding = 1) uniform PRECISION ${SAMPLER_T[NDIM][DTYPE]} im_mat1; -layout(set = 0, binding = 2) uniform PRECISION ${SAMPLER_T[NDIM][DTYPE]} im_mat2; -layout(set = 0, binding = 3) uniform PRECISION ${SAMPLER_T[NDIM][DTYPE]} im_self; - -layout(set = 0, binding = 4) uniform PRECISION restrict OutLimits { - ivec3 out_limits; -}; - -layout(set = 0, binding = 5) uniform PRECISION restrict InSizes { - ivec4 in_sizes; -}; - -layout(set = 0, binding = 6) uniform PRECISION restrict SelfSizes { - ivec3 self_sizes; -}; - -layout(set = 0, binding = 7) uniform PRECISION restrict AddmmParams { - float alpha; - float beta; -}; - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -void main() { - const ivec3 pos = ivec3(gl_GlobalInvocationID); - - if (any(greaterThanEqual(pos, out_limits))) { - return; - } - - vec4 texel = vec4(0); - - $if MAT1_PACKING == "W_packed": - $if MAT2_PACKING == "H_packed": - ivec3 mat2_pos = ivec3(pos.x * 4, 0, pos.z); - texel = matmul_naive_W_packed_H_packed( - im_mat1, - im_mat2, - pos, - in_sizes[0]); - $elif MAT2_PACKING == "W_packed": - texel = matmul_naive_W_packed_W_packed( - im_mat1, - im_mat2, - pos, - in_sizes[0]); - $else: - $raise Exception("Unsupported value for MAT2_PACKING") - $else: - $raise Exception("Unsupported value combo for MAT1_PACKING and MAT2_PACKING") - - vec4 self_texel = get_texel_W_packed( - im_self, - pos, - self_sizes.x == 1, - self_sizes.y == 1); - - texel = beta * self_texel + alpha * texel; - imageStore(im_out, pos, texel); -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/addmm_naive_texture3d.glsl b/backends/vulkan/runtime/graph/ops/glsl/addmm_naive_texture3d.glsl new file mode 100644 index 00000000000..3d9bf885df6 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/addmm_naive_texture3d.glsl @@ -0,0 +1,174 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#version 450 core + +#define PRECISION ${PRECISION} + +$if MAT2_IS_TRANSPOSED: + #define MAT2_IS_TRANSPOSED + +$if HAS_BIAS: + #define HAS_BIAS + +#include "indexing_utils.h" + +${layout_declare_tensor(B, "w", "out_tensor", DTYPE, "texture3d")} +${layout_declare_tensor(B, "r", "mat1_tensor", DTYPE, "texture3d")} +${layout_declare_tensor(B, "r", "mat2_tensor", DTYPE, "texture3d")} +$if HAS_BIAS: + ${layout_declare_tensor(B, "r", "bias_tensor", DTYPE, "texture3d")} +${layout_declare_ubo(B, "ivec4", "out_sizes")} +${layout_declare_ubo(B, "ivec3", "out_limits")} +${layout_declare_ubo(B, "ivec4", "out_axis_map")} +${layout_declare_ubo(B, "ivec4", "mat1_sizes")} +${layout_declare_ubo(B, "ivec4", "mat1_axis_map")} +${layout_declare_ubo(B, "ivec4", "mat2_sizes")} +${layout_declare_ubo(B, "ivec4", "mat2_axis_map")} +$if HAS_BIAS: + ${layout_declare_ubo(B, "ivec4", "bias_sizes")} + ${layout_declare_ubo(B, "ivec4", "bias_axis_map")} + ${layout_declare_ubo(B, "float", "alpha", "float", "beta")} + +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +layout(constant_id = 3) const int out_packed_dim = C_DIM; +layout(constant_id = 4) const int mat1_packed_dim = W_DIM; +layout(constant_id = 5) const int mat2_packed_dim = H_DIM; +layout(constant_id = 6) const int bias_packed_dim = W_DIM; + +#ifdef HAS_BIAS +vec4 get_bias_texel_W_packed(ivec3 logical_pos) { + ivec3 bias_pos = ivec3(0); + if (bias_sizes.y == 1) { + bias_pos[bias_axis_map.y] = 0; + } else { + bias_pos[bias_axis_map.y] = logical_pos.y; + } + if (bias_sizes.x == 1) { + bias_pos[bias_axis_map.x] = 0; + vec4 bias_texel = texelFetch(bias_tensor, bias_pos, 0); + // Only the first value is valid, the rest is 0 padding + return vec4(bias_texel.x); + } else { + bias_pos[bias_axis_map.x] = logical_pos.x; + } + + return texelFetch(bias_tensor, bias_pos, 0); +} +#endif // HAS_BIAS + +vec4 matmul_naive_k_dim_packed(const ivec3 out_lpos) { + ivec3 mat1_pos; + mat1_pos[mat1_axis_map.x] = 0; + mat1_pos[mat1_axis_map.y] = out_lpos.y; + mat1_pos[mat1_axis_map.z] = out_lpos.z; +#ifdef MAT2_IS_TRANSPOSED + const int mat2_k_axis = mat2_axis_map.x; + const int mat2_row_axis = mat2_axis_map.y; +#else + const int mat2_k_axis = mat2_axis_map.y; + const int mat2_row_axis = mat2_axis_map.x; +#endif // MAT2_IS_TRANSPOSED + + vec4 texel = vec4(0); + const int K = divup4(mat1_sizes.x); + + for (int i = 0; i < K; ++i) { + const vec4 mat1_tex = texelFetch(mat1_tensor, mat1_pos, 0); + + vec4 sums; + for (int r = 0; r < 4; ++r) { + // On-demand construction of mat2_pos appears to provide the lowest + // latency. Surprisingly, this doesn't translate to mat1_pos. 
+ ivec3 mat2_pos = ivec3(0); + mat2_pos[mat2_k_axis] = i; + mat2_pos[mat2_row_axis] = out_lpos.x * 4 + r; +#ifndef MAT2_IS_TRANSPOSED + mat2_pos[mat2_axis_map.z] = out_lpos.z; +#endif // MAT2_IS_TRANSPOSED + sums[r] = dot(mat1_tex, texelFetch(mat2_tensor, mat2_pos, 0)); + } + + texel += sums; + + mat1_pos[mat1_axis_map.x]++; + } + + return texel; +} + +vec4 matmul_naive_k_dim_packed_row_dim_packed(const ivec3 out_lpos) { + ivec3 mat1_pos; + mat1_pos[mat1_axis_map.x] = 0; + mat1_pos[mat1_axis_map.y] = out_lpos.y; + mat1_pos[mat1_axis_map.z] = out_lpos.z; + + ivec3 mat2_pos; + mat2_pos[mat2_axis_map.x] = out_lpos.x; + mat2_pos[mat2_axis_map.y] = 0; + mat2_pos[mat2_axis_map.z] = out_lpos.z; + + ivec3 mat2_pos_offset = ivec3(0); + mat2_pos_offset[mat2_axis_map.y] = 1; + + const int mat2_y_axis = mat2_axis_map.y; + + vec4 texel = vec4(0); + const int K = divup4(mat1_sizes.x); + + for (int i = 0; + i < K; + ++i, mat1_pos[mat1_axis_map.x]++, mat2_pos[mat2_axis_map.y]+=4) { + const vec4 mat1_tex = texelFetch(mat1_tensor, mat1_pos, 0); + + for (int r = 0; r < 4; ++r) { + // On-demand construction of mat2_pos appears to provide the lowest + // latency. Surprisingly, this doesn't translate to mat1_pos. + ivec3 mat2_pos = ivec3(0); + mat2_pos[mat2_axis_map.x] = out_lpos.x; + mat2_pos[mat2_axis_map.y] = 4 * i + r; + mat2_pos[mat2_axis_map.z] = out_lpos.z; + + vec4 mat1_comp_vec = vec4(mat1_tex[r]); + texel = fma(mat1_comp_vec, texelFetch(mat2_tensor, mat2_pos, 0), texel); + } + } + + return texel; +} + +void main() { + const ivec3 out_lpos = ivec3(gl_GlobalInvocationID); + if (any(greaterThanEqual(out_lpos, out_limits))) { + return; + } + + vec4 texel = vec4(0); + +#ifdef MAT2_IS_TRANSPOSED + if (mat2_packed_dim == W_DIM) { + texel = matmul_naive_k_dim_packed(out_lpos); + } else { + texel = matmul_naive_k_dim_packed_row_dim_packed(out_lpos); + } +#else + if (mat2_packed_dim == W_DIM) { + texel = matmul_naive_k_dim_packed_row_dim_packed(out_lpos); + } else { + texel = matmul_naive_k_dim_packed(out_lpos); + } +#endif // MAT2_IS_TRANSPOSED + +#ifdef HAS_BIAS + vec4 bias_texel = get_bias_texel_W_packed(out_lpos); + texel = beta * bias_texel + alpha * texel; +#endif // HAS_BIAS + + write_texel_lpos(out_tensor, out_lpos, texel, out_axis_map); +} diff --git a/backends/vulkan/runtime/graph/ops/glsl/addmm_naive.yaml b/backends/vulkan/runtime/graph/ops/glsl/addmm_naive_texture3d.yaml similarity index 61% rename from backends/vulkan/runtime/graph/ops/glsl/addmm_naive.yaml rename to backends/vulkan/runtime/graph/ops/glsl/addmm_naive_texture3d.yaml index 48db85cb56e..33b617eed13 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/addmm_naive.yaml +++ b/backends/vulkan/runtime/graph/ops/glsl/addmm_naive_texture3d.yaml @@ -4,21 +4,21 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
-addmm_naive: +addmm_naive_texture3d: parameter_names_with_default_values: DTYPE: float - NDIM: 3 - MAT1_PACKING: W_packed - MAT2_PACKING: H_packed MAT2_IS_TRANSPOSED: false + HAS_BIAS: true generate_variant_forall: DTYPE: - VALUE: float - VALUE: half shader_variants: - - NAME: addmm_naive_W_packed_H_packed - - NAME: addmm_naive_W_packed_W_packed - MAT2_PACKING: W_packed - - NAME: linear_naive_W_packed_W_packed - MAT2_PACKING: W_packed + - NAME: addmm_naive_texture3d + - NAME: matmul_naive_texture3d + HAS_BIAS: false + - NAME: linear_naive_texture3d MAT2_IS_TRANSPOSED: true + - NAME: matmul_transposed_naive_texture3d + MAT2_IS_TRANSPOSED: true + HAS_BIAS: false diff --git a/backends/vulkan/runtime/graph/ops/glsl/addmm_optimized.glsl b/backends/vulkan/runtime/graph/ops/glsl/addmm_optimized.glsl index 1698efb0b15..ad794d6db49 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/addmm_optimized.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/addmm_optimized.glsl @@ -16,90 +16,219 @@ $if MAT2_IS_TRANSPOSED: $if BATCH_MODE: #define BATCH_MODE -$if TILE_ROW == "tile_row_2": - #define TILE_ROW_2 +$if HAS_BIAS: + #define HAS_BIAS #include "indexing_utils.h" -#include "matmul.h" -// addmm will have additional arguments compared to regular mm -layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly image3D im_out; -layout(set = 0, binding = 1) uniform PRECISION ${SAMPLER_T[NDIM][DTYPE]} im_mat1; -layout(set = 0, binding = 2) uniform PRECISION ${SAMPLER_T[NDIM][DTYPE]} im_mat2; -layout(set = 0, binding = 3) uniform PRECISION ${SAMPLER_T[NDIM][DTYPE]} im_self; +${layout_declare_tensor(B, "w", "out_tensor", DTYPE, "texture3d")} +${layout_declare_tensor(B, "r", "mat1_tensor", DTYPE, "texture3d")} +${layout_declare_tensor(B, "r", "mat2_tensor", DTYPE, "texture3d")} +$if HAS_BIAS: + ${layout_declare_tensor(B, "r", "bias_tensor", DTYPE, "texture3d")} +${layout_declare_ubo(B, "ivec4", "out_sizes")} +${layout_declare_ubo(B, "ivec4", "out_axis_map")} +${layout_declare_ubo(B, "ivec4", "mat1_sizes")} +${layout_declare_ubo(B, "ivec4", "mat1_axis_map")} +${layout_declare_ubo(B, "ivec4", "mat2_sizes")} +${layout_declare_ubo(B, "ivec4", "mat2_axis_map")} +$if HAS_BIAS: + ${layout_declare_ubo(B, "ivec4", "bias_sizes")} + ${layout_declare_ubo(B, "ivec4", "bias_axis_map")} + ${layout_declare_ubo(B, "float", "alpha", "float", "beta")} -layout(set = 0, binding = 4) uniform PRECISION restrict OutLimits { - ivec3 out_limits; -}; +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; -layout(set = 0, binding = 5) uniform PRECISION restrict OutSizes { - ivec4 out_sizes; -}; +layout(constant_id = 3) const int out_packed_dim = C_DIM; -layout(set = 0, binding = 6) uniform PRECISION restrict SelfSizes { - ivec4 self_sizes; -}; +// To convince the SPIR-V compiler to unroll the loops optimally, need this +// macro +#define FOUR 4 -layout(set = 0, binding = 7) uniform PRECISION restrict InLimits { - ivec3 in_limits; +#define TILE_ROWS ${TILE_ROWS} + +// we avoid mat4 and vec4 usage here as they compile to much less efficient +// SPIR-V +struct FloatMatrix_2d { + float data[TILE_ROWS][FOUR]; }; -layout(set = 0, binding = 8) uniform PRECISION restrict Params { - float alpha; - float beta; +struct FloatMatrix_3d { + float data[TILE_ROWS][FOUR][FOUR]; }; -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; +#ifdef BATCH_MODE + #define FloatMatrix FloatMatrix_3d +#else + #define FloatMatrix FloatMatrix_2d +#endif // BATCH_MODE + +#ifdef HAS_BIAS +// get texel 
from self tensor (channel_packed) in addmm +vec4 get_texel_C_packed(const ivec2 idx) { + ivec3 bias_pos = ivec3(0); + if (bias_sizes.x > 1) { + bias_pos[bias_axis_map.x] = idx.x; + } + if (bias_sizes.y > 1) { + bias_pos[bias_axis_map.y] = idx.y; + } -void main() { - const ivec3 pos = ivec3(gl_GlobalInvocationID); + return texelFetch(bias_tensor, bias_pos, 0); +} +#endif // HAS_BIAS + +FloatMatrix matmul_partial(const ivec4 out_idx_tl) { + FloatMatrix results; + for (int i = 0; i < TILE_ROWS; i++) { + for (int j = 0; j < FOUR; j++) { +#ifdef BATCH_MODE + for (int k = 0; k < FOUR; k++) { + results.data[i][j][k] = 0.0f; + } +#else + results.data[i][j] = 0.0f; +#endif // BATCH_MODE + } + } + vec4 mat1_tensor_partial_load[TILE_ROWS]; + vec4 mat2_tensor_partial_load[FOUR]; + +#ifdef MAT2_IS_TRANSPOSED + const int mat2_k_axis = mat2_axis_map.x; + const int mat2_row_axis = mat2_axis_map.y; +#else + const int mat2_k_axis = mat2_axis_map.y; + const int mat2_row_axis = mat2_axis_map.x; +#endif // MAT2_IS_TRANSPOSED + +#ifdef BATCH_MODE + for (int batch_idx = 0; batch_idx < FOUR; batch_idx++) { + if (out_idx_tl.z + batch_idx >= out_sizes.z) { + break; + } +#endif // BATCH_MODE + for (int k = 0; k < mat1_sizes.x; k+=4) { + const int k_div4 = k >> 2; + // read and cache (4 x TILE_ROWS) tile of mat1 + for (int r = 0; r < TILE_ROWS; r++) { + ivec3 mat1_pos = ivec3(0); + mat1_pos[mat1_axis_map.x] = k_div4; + mat1_pos[mat1_axis_map.y] = out_idx_tl.y + r; +#ifdef BATCH_MODE + mat1_pos[mat1_axis_map.z] = out_idx_tl.z + batch_idx; +#endif // BATCH_MODE + + mat1_tensor_partial_load[r] = texelFetch(mat1_tensor, mat1_pos, 0); + } - if (any(greaterThanEqual(pos, out_limits))) { - return; + // read and cache (4 x 4) tile of mat2 + for (int r = 0; r < FOUR; ++r) { + ivec3 mat2_pos = ivec3(0); + mat2_pos[mat2_k_axis] = k_div4; + mat2_pos[mat2_row_axis] = out_idx_tl.x + r; +#if defined(BATCH_MODE) && !defined(MAT2_IS_TRANSPOSED) + mat2_pos[mat2_axis_map.z] = out_idx_tl.z + batch_idx; +#endif // BATCH_MODE + + mat2_tensor_partial_load[r] = texelFetch(mat2_tensor, mat2_pos, 0); + } + + // perform partial dot products and add partial result to results + for (int out_row = 0; out_row < TILE_ROWS; out_row++) { + for (int out_col = 0; out_col < FOUR; out_col++) { +#ifdef BATCH_MODE + results.data[out_row][out_col][batch_idx] += +#else + results.data[out_row][out_col] += +#endif // BATCH_MODE + dot(mat1_tensor_partial_load[out_row], mat2_tensor_partial_load[out_col]); + } + } } +#ifdef BATCH_MODE + } +#endif // BATCH_MODE + + return results; +} - $if BATCH_MODE: - FloatMatrix_3d results = matmul_partial_3d( - im_mat1, - im_mat2, - pos, - out_sizes[2], - in_limits[0]); - $else: - FloatMatrix_2d results = matmul_partial_2d( - im_mat1, - im_mat2, - pos, - out_sizes[2], - in_limits[0]); - - for (int idx_c = 0; idx_c < TILE_ROWS; idx_c++) { - for (int idx_r = 0; idx_r < FOUR; idx_r++) { - const ivec3 out_pos = - ivec3(idx_r + FOUR * pos.x, idx_c + TILE_ROWS * pos.y, pos.z); - - vec4 self_texel = get_texel_C_packed( - im_self, - out_pos, - self_sizes.x == 1, - self_sizes.y == 1); - - // results is in transposed order w.r.t. 
the desired output - $if BATCH_MODE: - imageStore( - im_out, - out_pos, - vec4( - beta * self_texel.x + alpha * results.data[idx_c][idx_r][0], - beta * self_texel.x + alpha * results.data[idx_c][idx_r][1], - beta * self_texel.x + alpha * results.data[idx_c][idx_r][2], - beta * self_texel.x + alpha * results.data[idx_c][idx_r][3])); - $else: - imageStore( - im_out, - out_pos, - vec4( - beta * self_texel.x + alpha * results.data[idx_c][idx_r], 0.0, 0.0, 0.0)); +// +// Write result matrix to output (3D matmul) +// + +void write_results_C_packed(const ivec4 out_idx_tl, FloatMatrix results) { + ivec3 out_pos = tidx_to_pos( + out_idx_tl, out_sizes, out_axis_map, out_packed_dim); + + for (int tile_c = 0; + tile_c < TILE_ROWS; + tile_c++, out_pos[out_axis_map.y]++) { + out_pos[out_axis_map.x] = out_idx_tl.x; + + for (int tile_r = 0; + tile_r < FOUR; + tile_r++, out_pos[out_axis_map.x]++) { + +#ifdef HAS_BIAS + ivec2 bias_idx; + bias_idx[bias_axis_map.x] = out_pos[out_axis_map.x]; + bias_idx[bias_axis_map.y] = out_pos[out_axis_map.y]; + float bias_val = get_texel_C_packed(bias_idx).x; +#ifdef BATCH_MODE + vec4 bias_texel = vec4(bias_val); +#else + vec4 bias_texel = vec4(bias_val, 0, 0, 0); +#endif // BATCH_MODE +#endif // HAS_BIAS + +#ifdef BATCH_MODE + vec4 out_texel = vec4( + results.data[tile_c][tile_r][0], + results.data[tile_c][tile_r][1], + results.data[tile_c][tile_r][2], + results.data[tile_c][tile_r][3]); +#else + vec4 out_texel = vec4( + results.data[tile_c][tile_r], + 0.0, + 0.0, + 0.0); +#endif // BATCH_MODE + +#ifdef HAS_BIAS + imageStore(out_tensor, out_pos, beta * bias_texel + alpha * out_texel); +#else + imageStore(out_tensor, out_pos, out_texel); +#endif // HAS_BIAS } } } + +void main() { + // Each thread is responsible for calculating a (4 x TILE_ROWS x 1) tile of + // output elements. If the input matrices are 3D, then a (4 x TILE_ROWS x 4) + // tile of output elements will be computed. Note the sizes are written in + // (W x H x C) format. 
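For illustration, a minimal Python sketch of the tile arithmetic in the code that follows; the helper name tile_topleft is ours, and tile_rows=4 mirrors the default TILE_ROWS variant from the yaml:

def tile_topleft(tile_idx, tile_rows=4, batch_mode=False):
    # tile_idx is the thread's (x, y, z) global invocation id
    x, y, z = tile_idx
    return (x * 4,                        # width: each tile spans 4 output columns
            y * tile_rows,                # height: each tile spans TILE_ROWS output rows
            z * 4 if batch_mode else z,   # batch mode covers 4 batch entries per tile
            0)

# e.g. thread (2, 1, 0) owns output columns 8..11 and rows 4..7
assert tile_topleft((2, 1, 0)) == (8, 4, 0, 0)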
+ const ivec3 tile_idx = ivec3(gl_GlobalInvocationID); + + // Calculate the tensor index of the top left element in the output tile + const ivec4 out_idx_topleft = ivec4( + tile_idx.x * 4, + tile_idx.y * TILE_ROWS, +#ifdef BATCH_MODE + tile_idx.z * 4, +#else + tile_idx.z, +#endif // BATCH_MODE + 0); + + // If the top left element is already out of range, then skip + if (any(greaterThanEqual(out_idx_topleft, out_sizes))) { + return; + } + + FloatMatrix results = matmul_partial(out_idx_topleft); + + write_results_C_packed(out_idx_topleft, results); +} diff --git a/backends/vulkan/runtime/graph/ops/glsl/addmm_optimized.yaml b/backends/vulkan/runtime/graph/ops/glsl/addmm_optimized.yaml index b958d3b9543..c82c2003d20 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/addmm_optimized.yaml +++ b/backends/vulkan/runtime/graph/ops/glsl/addmm_optimized.yaml @@ -7,24 +7,37 @@ addmm_optimized: parameter_names_with_default_values: DTYPE: float - NDIM: 3 - PACKING: C_packed MAT2_IS_TRANSPOSED: false BATCH_MODE: false - TILE_ROW: tile_row_4 + TILE_ROWS: 4 + HAS_BIAS: true generate_variant_forall: - TILE_ROW: - - VALUE: tile_row_4 - - VALUE: tile_row_2 + TILE_ROWS: + - VALUE: 4 + SUFFIX: tile_row_4 + - VALUE: 2 + SUFFIX: tile_row_2 DTYPE: - VALUE: float - VALUE: half shader_variants: - NAME: addmm_optimized + - NAME: matmul_optimized + HAS_BIAS: false - NAME: linear_optimized MAT2_IS_TRANSPOSED: true + - NAME: matmul_transposed_optimized + MAT2_IS_TRANSPOSED: true + HAS_BIAS: false - NAME: batch_addmm_optimized BATCH_MODE: true + - NAME: batch_matmul_optimized + BATCH_MODE: true + HAS_BIAS: false - NAME: batch_linear_optimized MAT2_IS_TRANSPOSED: true BATCH_MODE: true + - NAME: batch_matmul_transposed_optimized + MAT2_IS_TRANSPOSED: true + BATCH_MODE: true + HAS_BIAS: false diff --git a/backends/vulkan/runtime/graph/ops/glsl/binary_op.glsl b/backends/vulkan/runtime/graph/ops/glsl/binary_op.glsl index ec7e1da296c..3103c92aea1 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/binary_op.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/binary_op.glsl @@ -19,38 +19,43 @@ layout(std430) buffer; -${layout_declare_tensor(0, "w", "t_out", DTYPE, STORAGE)} -${layout_declare_tensor(1, "r", "t_in", DTYPE, STORAGE)} -${layout_declare_tensor(2, "r", "t_other", DTYPE, STORAGE)} -${layout_declare_ubo(3, "ivec4", "out_sizes")} -${layout_declare_ubo(4, "ivec4", "in_sizes")} -${layout_declare_ubo(5, "ivec4", "other_sizes")} -${layout_declare_ubo(6, "ivec2", "broadcast_params")} -${layout_declare_ubo(7, "float", "alpha")} +${layout_declare_tensor(B, "w", "t_out", DTYPE, STORAGE)} +${layout_declare_tensor(B, "r", "t_in", DTYPE, STORAGE)} +${layout_declare_tensor(B, "r", "t_other", DTYPE, STORAGE)} +${layout_declare_ubo(B, "ivec4", "out_sizes")} +${layout_declare_ubo(B, "ivec4", "out_axis_map")} +${layout_declare_ubo(B, "ivec4", "in_sizes")} +${layout_declare_ubo(B, "ivec4", "in_axis_map")} +${layout_declare_ubo(B, "ivec4", "other_sizes")} +${layout_declare_ubo(B, "ivec4", "other_axis_map")} +${layout_declare_ubo(B, "ivec2", "broadcast_params")} +${layout_declare_ubo(B, "float", "alpha")} layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; layout(constant_id = 3) const int packed_dim = C_DIM; void main() { - const ivec3 pos = ivec3(gl_GlobalInvocationID); - const ivec4 idx = to_tensor_idx(pos, out_sizes, packed_dim); + const ivec3 lpos = ivec3(gl_GlobalInvocationID); + const ivec4 tidx = lpos_to_tidx(lpos, out_sizes, out_axis_map.w, packed_dim); - if (any(greaterThanEqual(idx, out_sizes))) { + if 
(any(greaterThanEqual(tidx, out_sizes))) { return; } - ivec4 in_idx = broadcast_indices(idx, in_sizes); - VEC4_T in_texel = VEC4_T(texelFetch( + // broadcast on logical sizes + ivec4 in_idx = broadcast_indices(tidx, in_sizes); + VEC4_T in_texel = VEC4_T(load_texel( t_in, - to_texture_pos(in_idx, in_sizes, packed_dim), - 0)); + // read axis mapped texel + tidx_to_pos(in_idx, in_sizes, in_axis_map, packed_dim))); - ivec4 other_idx = broadcast_indices(idx, other_sizes); - VEC4_T other_texel = VEC4_T(texelFetch( + // broadcast on logical sizes + ivec4 other_idx = broadcast_indices(tidx, other_sizes); + VEC4_T other_texel = VEC4_T(load_texel( t_other, - to_texture_pos(other_idx, other_sizes, packed_dim), - 0)); + // read axis mapped texel + tidx_to_pos(other_idx, other_sizes, other_axis_map, packed_dim))); // Check boolean broadcast flags; we use ivec2 instead of bvec2 for alignment. if (broadcast_params.x > 0) { @@ -60,5 +65,9 @@ void main() { other_texel = other_texel.xxxx; } - imageStore(t_out, pos, VEC4_T(op(in_texel, other_texel, alpha))); + write_texel_lpos( + t_out, + lpos, + VEC4_T(op(in_texel, other_texel, alpha)), + out_axis_map); } diff --git a/backends/vulkan/runtime/graph/ops/glsl/buffer_to_nchw.glsl b/backends/vulkan/runtime/graph/ops/glsl/buffer_to_nchw.glsl index 58796879e85..201b4d17262 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/buffer_to_nchw.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/buffer_to_nchw.glsl @@ -23,13 +23,13 @@ layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; layout(constant_id = 3) const int UNUSED_packed_dim = W_DIM; void main() { - int out_id = int(gl_GlobalInvocationID.x); - if (out_id >= numel) { + int nchwi = int(gl_GlobalInvocationID.x); + if (nchwi >= numel) { return; } - ivec4 t_in_idx = from_nchw_buffer_i(out_id, in_sizes); - const int in_id = to_buffer_id(t_in_idx, in_strides); + ivec4 in_tidx = nchwi_to_tidx(nchwi, in_sizes); + const int in_bufi = tidx_to_bufi(in_tidx, in_strides); - nchw_buf[out_id] = t_in[in_id]; + nchw_buf[nchwi] = t_in[in_bufi]; } diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv1d.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv1d.glsl index 35f8e25fc25..fe6d7ba7a96 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/conv1d.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/conv1d.glsl @@ -18,32 +18,22 @@ layout(std430) buffer; -layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[NDIM][DTYPE]} image_out; -layout(set = 0, binding = 1) uniform PRECISION sampler3D image_in; -layout(set = 0, binding = 2) uniform PRECISION sampler3D kernel_in; -layout(set = 0, binding = 3) uniform PRECISION sampler3D bias_in; - -layout(set = 0, binding = 4) uniform PRECISION restrict OutLimits { - ivec3 out_limits; -}; - -layout(set = 0, binding = 5) uniform PRECISION restrict InSizes { - ivec4 in_sizes; -}; - -layout(set = 0, binding = 6) uniform PRECISION restrict Params { - int kernel_size; - int stride; - int padding; - int dilation; - int in_group_size; - int out_group_size; -}; - -layout(set = 0, binding = 7) uniform PRECISION restrict OutputParams { - float out_min; - float out_max; -}; +${layout_declare_tensor(B, "w", "t_out", DTYPE, STORAGE)} +${layout_declare_tensor(B, "r", "t_in", DTYPE, STORAGE)} +${layout_declare_tensor(B, "r", "kernel_in", DTYPE, STORAGE)} +${layout_declare_tensor(B, "r", "bias_in", DTYPE, STORAGE)} + +${layout_declare_ubo(B, "ivec3", "out_limits")} +${layout_declare_ubo(B, "ivec4", "in_sizes")} + +${layout_declare_ubo(B, 
"ivec4", "out_axis_map")} +${layout_declare_ubo(B, "ivec4", "in_axis_map")} +${layout_declare_ubo(B, "ivec4", "kernel_axis_map")} +${layout_declare_ubo(B, "ivec4", "bias_axis_map")} + +${layout_declare_ubo(B,"int", "kernel_size", "int", "stride", "int", "padding", "int", "dilation", "int", "in_group_size", "int", "out_group_size")} + +${layout_declare_ubo(B, "float", "out_min", "float", "out_max")} layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; @@ -67,9 +57,9 @@ layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; // shader invocations, where each invocation computes 1 result. But that // performs worse. void main() { - const ivec3 pos = ivec3(gl_GlobalInvocationID); + const ivec3 lpos = ivec3(gl_GlobalInvocationID); - if (any(greaterThanEqual(pos, out_limits))) { + if (any(greaterThanEqual(lpos, out_limits))) { return; } @@ -78,8 +68,8 @@ void main() { // "out_c" is the output's channel index where we write our result. // Across shader invocations, this is the only value that varies. - int out_c = pos.y; - vec4 bias = texelFetch(bias_in, ivec3(out_c, 0, 0), 0); + int out_c = lpos.y; + VEC4_T bias = load_texel_lpos(bias_in, ivec3(out_c, 0, 0), bias_axis_map); // "in_c" tracks the input's channel start index. // We iterate over the input group that corresponds to the output group. @@ -98,7 +88,7 @@ void main() { int out_l = 0; for (int in_l = l_start; in_l < l_end; in_l += stride, ++out_l) { - vec4 sum = vec4(0); + VEC4_T sum = VEC4_T(0); for (int in_c = c_start; in_c < c_end; ++in_c) { // "k" tracks the kernel's index for our input-kernel computation. @@ -107,25 +97,25 @@ void main() { for (int k = 0; k < kernel_size; k += 4) { // Since the weight tensor is width-packed, which is along the length // dimension, we can batch-read four elements at a time. 
- const ivec3 w_pos = ivec3(k / 4, in_c % in_group_size, out_c); - const vec4 weight = texelFetch(kernel_in, w_pos, 0); + const ivec3 w_lpos = ivec3(k / 4, in_c % in_group_size, out_c); + const VEC4_T weight = load_texel_lpos(kernel_in, w_lpos, kernel_axis_map); - const ivec3 in_pos_0 = ivec3(in_l + k * dilation, in_c, n / 4); - sum = fma(weight.xxxx, texelFetch(image_in, in_pos_0, 0), sum); + ivec3 in_pos = lpos_to_pos(ivec3(in_l + k * dilation, in_c, n / 4), in_axis_map); + sum = fma(weight.xxxx, load_texel(t_in, in_pos), sum); - const ivec3 in_pos_1 = ivec3(in_l + (k+1) * dilation, in_c, n / 4); - sum = fma(weight.yyyy, texelFetch(image_in, in_pos_1, 0), sum); + in_pos[in_axis_map.x] += dilation; + sum = fma(weight.yyyy, load_texel(t_in, in_pos), sum); - const ivec3 in_pos_2 = ivec3(in_l + (k+2) * dilation, in_c, n / 4); - sum = fma(weight.zzzz, texelFetch(image_in, in_pos_2, 0), sum); + in_pos[in_axis_map.x] += dilation; + sum = fma(weight.zzzz, load_texel(t_in, in_pos), sum); - const ivec3 in_pos_3 = ivec3(in_l + (k+3) * dilation, in_c, n / 4); - sum = fma(weight.wwww, texelFetch(image_in, in_pos_3, 0), sum); + in_pos[in_axis_map.x] += dilation; + sum = fma(weight.wwww, load_texel(t_in, in_pos), sum); } } - ivec3 out_pos = ivec3(out_l, out_c, n / 4); - imageStore(image_out, out_pos, op(sum + bias.x, out_min, out_max)); + const ivec3 out_lpos = ivec3(out_l, out_c, n / 4); + write_texel_lpos(t_out, out_lpos, op(sum + bias.x, out_min, out_max), out_axis_map); } } } diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv1d.yaml b/backends/vulkan/runtime/graph/ops/glsl/conv1d.yaml index 16c4172510c..2266649d2b9 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/conv1d.yaml +++ b/backends/vulkan/runtime/graph/ops/glsl/conv1d.yaml @@ -7,9 +7,8 @@ conv1d: parameter_names_with_default_values: OPERATOR: X - NDIM: 3 DTYPE: float - PACKING: C_packed + STORAGE: texture3d generate_variant_forall: DTYPE: - VALUE: half diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_prepack_weights.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_prepack_weights.glsl index 18202e4a51f..49ce76423d5 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_prepack_weights.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_prepack_weights.glsl @@ -53,7 +53,7 @@ void main() { } // Map tensor_idx to normal buffer_i - const ivec4 p0 = get_texel_nchw_buffer_ixs(idx, sizes, packed_dim); + const ivec4 p0 = tidx_to_nchwi(idx, sizes, packed_dim); // Compute modified tensor_idx by inverting the CPU function const int N = original_sizes.w; diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_prepack_weights.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv2d_prepack_weights.glsl index 493a614ee81..4e8bff94947 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/conv2d_prepack_weights.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d_prepack_weights.glsl @@ -53,7 +53,7 @@ void main() { } // Map tensor_idx to normal buffer_i - const ivec4 p0 = get_texel_nchw_buffer_ixs(idx, sizes, packed_dim); + const ivec4 p0 = tidx_to_nchwi(idx, sizes, packed_dim); // Compute modified tensor_idx by inverting the CPU function const int N = original_sizes.w; diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv_transpose2d_prepack_weights.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv_transpose2d_prepack_weights.glsl index d2978ffe7e6..df8589e737f 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/conv_transpose2d_prepack_weights.glsl +++ 
b/backends/vulkan/runtime/graph/ops/glsl/conv_transpose2d_prepack_weights.glsl @@ -53,7 +53,7 @@ void main() { } // Map tensor_idx to normal buffer_i - const ivec4 p0 = get_texel_nchw_buffer_ixs(idx, sizes, packed_dim); + const ivec4 p0 = tidx_to_nchwi(idx, sizes, packed_dim); // Compute modified tensor_idx by inverting the CPU function const int N = original_sizes.w; diff --git a/backends/vulkan/runtime/graph/ops/glsl/copy_offset.glsl b/backends/vulkan/runtime/graph/ops/glsl/copy_offset.glsl index 50ddb92c349..d709578b1c9 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/copy_offset.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/copy_offset.glsl @@ -10,19 +10,16 @@ #define PRECISION ${PRECISION} +#include "indexing_utils.h" + layout(std430) buffer; -${layout_declare_tensor(0, "w", "t_out", DTYPE, STORAGE)} -${layout_declare_tensor(1, "r", "t_in", DTYPE, STORAGE)} +${layout_declare_tensor(B, "w", "t_out", DTYPE, STORAGE)} +${layout_declare_tensor(B, "r", "t_in", DTYPE, STORAGE)} -layout(set = 0, binding = 2) uniform PRECISION restrict CopyArgs { - ivec3 range; - int unused0; - ivec3 src_offset; - int unused1; - ivec3 dst_offset; - int unused2; -}; +${layout_declare_ubo(B, "ivec3", "range", "ivec3", "src_offset", "ivec3", "dst_offset")} +${layout_declare_ubo(B, "ivec4", "out_axis_map")} +${layout_declare_ubo(B, "ivec4", "in_axis_map")} layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; @@ -36,5 +33,9 @@ void main() { return; } - imageStore(t_out, out_pos, texelFetch(t_in, in_pos, 0)); + write_texel_lpos( + t_out, + out_pos, + load_texel_lpos(t_in, in_pos, in_axis_map), + out_axis_map); } diff --git a/backends/vulkan/runtime/graph/ops/glsl/embedding.glsl b/backends/vulkan/runtime/graph/ops/glsl/embedding.glsl index 3adffe99bdb..0a3eeee257f 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/embedding.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/embedding.glsl @@ -16,34 +16,36 @@ layout(std430) buffer; #include "indexing_utils.h" -${layout_declare_tensor(0, "w", "t_out", DTYPE, STORAGE)} -${layout_declare_tensor(1, "r", "t_in", "int", STORAGE)} -${layout_declare_tensor(2, "r", "t_weight", DTYPE, STORAGE)} -${layout_declare_ubo(3, "ivec4", "sizes")} +${layout_declare_tensor(B, "w", "t_out", DTYPE, STORAGE)} +${layout_declare_tensor(B, "r", "t_in", "int", STORAGE)} +${layout_declare_tensor(B, "r", "t_weight", DTYPE, STORAGE)} +${layout_declare_ubo(B, "ivec4", "sizes")} +${layout_declare_ubo(B, "ivec4", "out_axis_map")} +${layout_declare_ubo(B, "ivec4", "in_axis_map")} +${layout_declare_ubo(B, "ivec4", "weight_axis_map")} layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; layout(constant_id = 3) const int packed_dim = C_DIM; void main() { - const ivec3 out_pos = ivec3(gl_GlobalInvocationID); - - if (pos_out_of_bounds(out_pos, sizes, packed_dim)) { + const ivec3 out_lpos = ivec3(gl_GlobalInvocationID); + const ivec4 out_tidx = lpos_to_tidx(out_lpos, sizes, out_axis_map.w, packed_dim); + if (any(greaterThanEqual(out_tidx, sizes))) { return; } - - const ivec4 out_idx = to_tensor_idx(out_pos, sizes, packed_dim); VEC4_T out_texel; // Consider optimizing via W-packing format for t_in and t_weight. for (int i = 0; i < 4; ++i) { // Read input tensor for embedding index. 
- const ivec3 in_pos = ivec3(out_pos.y, out_idx.z * 4 + i, out_idx.w / 4); - const int in_texel_elem = texelFetch(t_in, in_pos, 0)[out_idx.w % 4]; + const ivec3 in_lpos = ivec3(out_tidx.y, out_tidx.z * 4 + i, out_tidx.w / 4); + const int in_texel_elem = load_texel_lpos(t_in, in_lpos, in_axis_map)[out_tidx.w % 4]; // Read weight tensor for embedding. - out_texel[i] = texelFetch(t_weight, ivec3(out_pos.x, in_texel_elem, 0), 0).x; + const ivec3 weight_lpos = ivec3(out_tidx.x, in_texel_elem, 0); + out_texel[i] = load_texel_lpos(t_weight, weight_lpos, weight_axis_map).x; } - imageStore(t_out, out_pos, out_texel); + write_texel_lpos(t_out, out_lpos, out_texel, out_axis_map); } diff --git a/backends/vulkan/runtime/graph/ops/glsl/image_to_nchw.glsl b/backends/vulkan/runtime/graph/ops/glsl/image_to_nchw.glsl index b51d5a3f6ed..be3901799f8 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/image_to_nchw.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/image_to_nchw.glsl @@ -21,16 +21,17 @@ ${define_required_extensions(DTYPE)} layout(std430) buffer; -${layout_declare_buffer(0, "w", "nchw_out", DTYPE)} -${layout_declare_tensor(1, "r", "t_in", DTYPE, STORAGE)} -${layout_declare_ubo(2, "ivec4", "sizes")} +${layout_declare_buffer(B, "w", "nchw_out", DTYPE)} +${layout_declare_tensor(B, "r", "t_in", DTYPE, STORAGE)} +${layout_declare_ubo(B, "ivec4", "sizes")} +${layout_declare_ubo(B, "ivec4", "axis_map")} layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; layout(constant_id = 3) const int packed_dim = C_DIM; void write_out_texel(VEC4_T texel, ivec4 tensor_idx) { - const ivec4 buf_indices = get_texel_nchw_buffer_ixs( + const ivec4 buf_indices = tidx_to_nchwi( tensor_idx, sizes, packed_dim); @@ -50,13 +51,13 @@ void write_out_texel(VEC4_T texel, ivec4 tensor_idx) { } void main() { - const ivec3 pos = ivec3(gl_GlobalInvocationID); - const ivec4 tensor_idx = to_tensor_idx(pos, sizes, packed_dim); + const ivec3 lpos = ivec3(gl_GlobalInvocationID); + const ivec4 tidx = lpos_to_tidx(lpos, sizes, axis_map.w, packed_dim); - if (any(greaterThanEqual(tensor_idx, sizes))) { + if (any(greaterThanEqual(tidx, sizes))) { return; } - const VEC4_T intex = load_texel(t_in, pos); - write_out_texel(intex, tensor_idx); + const VEC4_T intex = load_texel(t_in, lpos_to_pos(lpos, axis_map)); + write_out_texel(intex, tidx); } diff --git a/backends/vulkan/runtime/graph/ops/glsl/index_select_channel.glsl b/backends/vulkan/runtime/graph/ops/glsl/index_select_channel.glsl index ba60000f3d4..76ec540838c 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/index_select_channel.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/index_select_channel.glsl @@ -34,18 +34,18 @@ void main() { } const ivec4 idx = to_tensor_idx(out_pos, out_sizes, packed_dim); - const ivec4 buffer_ixs = get_texel_nchw_buffer_ixs(idx, out_sizes, packed_dim); + const ivec4 buffer_ixs = tidx_to_nchwi(idx, out_sizes, packed_dim); VEC4_T out_texel; for (int i = 0; i < 4; ++i) { - const ivec4 out_idx = from_nchw_buffer_i(buffer_ixs[i], out_sizes); - int out_channel = out_idx.z; + const ivec4 out_tidx = nchwi_to_tidx(buffer_ixs[i], out_sizes); + int out_channel = out_tidx.z; int in_channel = texelFetch(t_idx, ivec3(out_channel, 0, 0), 0).x; - ivec4 in_idx = out_idx; - in_idx.z = in_channel; + ivec4 in_tidx = out_tidx; + in_tidx.z = in_channel; - ivec4 in_elem_pos = to_texture_elem_pos(in_idx, in_sizes, packed_dim); + ivec4 in_elem_pos = to_texture_elem_pos(in_tidx, in_sizes, packed_dim); VEC4_T in_texel = texelFetch(t_in, in_elem_pos.xyz, 0); diff 
--git a/backends/vulkan/runtime/graph/ops/glsl/indexing_utils.h b/backends/vulkan/runtime/graph/ops/glsl/indexing_utils.h index 21eadff0b36..cf6686ee08c 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/indexing_utils.h +++ b/backends/vulkan/runtime/graph/ops/glsl/indexing_utils.h @@ -9,32 +9,44 @@ #ifndef INDEXING_UTILS_H #define INDEXING_UTILS_H -// Width Dim Index, assuming (W, H, C, N) order +/* + * The functions defined in this header file use the following shorthand to + * represent tensor related data structures. + * + * tidx - ivec4 tensor indices, listed in WHCN order. + * + * pos - ivec3 texel position, used to fetch from an image texture via the + * texelFetch(image, pos, lod) GLSL function. + * posi - ivec4 texel element position. It is the same as pos, except with an + * additional component of the index of an element within the texel. + * lpos - ivec3 logical position, listed in WHC order. This is a permutation of + * texture position based on a tensor's axis_map. lpos.x is the position + * component that corresponds to the tensor's width dimension, lpos.y is + * the position component that corresponds to the tensor's height dim, + * and so on. + * + * bufi - int index into a GPU buffer that backs a tensor. + * nchwi - int index into a staging buffer for a tensor. The data in the + * staging buffer is stored in contiguous data layout, irrespective of + * the tensor's strides. + */ + +// Width Dim Index, assuming WHCN order #define W_DIM 0 -// Height, assuming (W, H, C, N) order +// Height, assuming WHCN order #define H_DIM 1 -// Channels, assuming (W, H, C, N) order +// Channels, assuming WHCN order #define C_DIM 2 /* - * Describes which texture axis the "batches" dimension runs along in a 4D - * texture. - * - * Currently it is set to 2 since we represent batches by concatenating along - * the channels dim, which has index 2 in (W, H, C, N) order and maps to the - * depth dimension of a texture, which also corresponds to index 2 in (x, y, z) - * order. + * Fast division by 4 using bit shifting */ -#define BATCH_AXIS 2 - -// -// Basic Indexing Utility Macros and Functions -// +#define div4(x) (x >> 2) /* * Divides input and rounds up to 4 */ -#define divup4(x) ((x + 3) / 4) +#define divup4(x) ((x + 3) >> 2) /* * Aligns input to the next multiple of 4 @@ -42,8 +54,8 @@ #define alignup4(x) ((x + 3) & -4) /* - * Input: (W, H, C, N) strides of a tensor - * Returns: the WHCN index of the fastest moving dimension + * Find the packed dimension of a tensor given its strides. The packed dimension + * is the "fastest moving" dimension which will have a stride of 1. */ int find_packed_dim(const ivec4 strides) { int packed_dim = 0; @@ -56,83 +68,179 @@ int find_packed_dim(const ivec4 strides) { return packed_dim; } -// -// (w, h, c, n) Tensor Index <-> Contiguous Buffer Index Conversion -// - /* - * Input: (w, h, c, n) tensor index, (W, H, C, N) sizes of a tensor, which dim - * is packed along a texel - * Output: A ivec4 containing the buffer indices corresponding to each texel - * element. + * Get the staging buffer indices that contain the data of the texel that + * corresponds to the provided tensor index. Since the texel have 4 elements, + * 4 buffer indices will be retrieved. 
*/ -ivec4 get_texel_nchw_buffer_ixs(ivec4 idx, ivec4 sizes, int packed_dim) { +ivec4 tidx_to_nchwi(const ivec4 tidx, const ivec4 sizes, const int packed_dim) { ivec4 strides = ivec4(1, sizes.x, sizes.x * sizes.y, sizes.x * sizes.y * sizes.z); - int base_i = idx.x * strides.x + idx.y * strides.y + idx.z * strides.z + - idx.w * strides.w; + int base_i = tidx.x * strides.x + tidx.y * strides.y + tidx.z * strides.z + + tidx.w * strides.w; return base_i + ivec4(0, 1, 2, 3) * strides[packed_dim]; } -/* - * Input: Index into a tensor's data buffer, (W, H, C, N) sizes of a tensor - * Returns: The WCHN index of the tensor that corresponds to the specified - * buffer index, assuming the buffer has contiguous memory layout - */ -ivec4 from_nchw_buffer_i(int buf_i, ivec4 sizes) { +ivec4 nchwi_to_tidx(const int nchwi, const ivec4 sizes) { return ivec4( - buf_i % sizes.x, - (buf_i / (sizes.x)) % sizes.y, - (buf_i / (sizes.x * sizes.y)) % sizes.z, - (buf_i / (sizes.x * sizes.y * sizes.z))); + nchwi % sizes.x, + (nchwi / (sizes.x)) % sizes.y, + (nchwi / (sizes.x * sizes.y)) % sizes.z, + (nchwi / (sizes.x * sizes.y * sizes.z))); } -int to_nchw_buffer_i(const ivec4 tensor_idx, const ivec4 sizes) { - return tensor_idx.w * sizes.x * sizes.y * sizes.z + - tensor_idx.z * sizes.x * sizes.y + tensor_idx.y * sizes.x + tensor_idx.x; +int tidx_to_nchwi(const ivec4 tidx, const ivec4 sizes) { + return tidx.w * sizes.x * sizes.y * sizes.z + tidx.z * sizes.x * sizes.y + + tidx.y * sizes.x + tidx.x; } -/* - * Input: Texel buffer index, (W, H, C, N) strides of a tensor, which dim is - * packed along a texel - * Returns: The (w, h, c, n) tensor index corresponding to the buffer element - */ -ivec4 to_tensor_idx(int buffer_id, const ivec4 strides, const int packed_dim) { +// TODO(ssjia): make this function use dim order so that it can work with any +// dim order. Currently it assumes that the dim order is contiguous, except for +// the packed dim. +ivec4 bufi_to_tidx(int bufi, const ivec4 strides, const int packed_dim) { ivec4 idx; for (int i = 3; i >= 0; i--) { if (i != packed_dim) { - idx[i] = buffer_id / strides[i]; - buffer_id %= strides[i]; + idx[i] = bufi / strides[i]; + bufi %= strides[i]; } } - idx[packed_dim] = buffer_id; + idx[packed_dim] = bufi; return idx; } -/* - * Input: Texel buffer index, (W, H, C, N) strides of a tensor - * Returns: The (w, h, c, n) tensor index corresponding to the buffer element - * - * This is a convenience overload of the above function. If the packed dim is - * not known, it can be found by finding the first dimension with a stride of 1. - * However, this process adds some overhead, so if performance is a concern then - * the above function should be used instead so that the packed dim is provided. - */ -ivec4 to_tensor_idx(int buffer_id, const ivec4 strides) { +// Convenience overload of the above function, which will determine the packed +// dim from the strides automatically so it doesn't have to be passed in as a +// function argument. 
+ivec4 bufi_to_tidx(const int bufi, const ivec4 strides) { int packed_dim = find_packed_dim(strides); - return to_tensor_idx(buffer_id, strides, packed_dim); + return bufi_to_tidx(bufi, strides, packed_dim); +} + +int tidx_to_bufi(const ivec4 tidx, ivec4 strides) { + return tidx.x * strides.x + tidx.y * strides.y + tidx.z * strides.z + + tidx.w * strides.w; +} + +ivec4 lpos_to_tidx( + ivec3 lpos, + ivec4 sizes, + const int batch_inner_dim, + const int packed_dim) { + // Align packed dim to next multiple of 4 to account for texel padding + sizes[packed_dim] = alignup4(sizes[packed_dim]); + // Moving 1 texel along the packed dim traverses 4 tensor elements + lpos[packed_dim] *= 4; + + ivec4 tidx = ivec4(lpos, 0); + + if (sizes.w > 1) { + tidx.w = tidx[batch_inner_dim] / sizes[batch_inner_dim]; + tidx[batch_inner_dim] %= sizes[batch_inner_dim]; + } + return tidx; +} + +ivec3 tidx_to_lpos( + ivec4 tidx, + ivec4 sizes, + const int batch_inner_dim, + const int packed_dim) { + // Align packed dim to next multiple of 4 to account for texel padding + sizes[packed_dim] = alignup4(sizes[packed_dim]); + + ivec3 lpos = tidx.xyz; + + // Adjust batch inner dim by batch index if needed + if (sizes.w > 1) { + lpos[batch_inner_dim] += tidx.w * sizes[batch_inner_dim]; + } + // Fast division by 4, since moving 1 texel along the packed dim traverses 4 + // tensor elements. + lpos[packed_dim] >>= 2; + return lpos; +} + +ivec3 tidx_to_pos( + ivec4 tidx, + ivec4 sizes, + const ivec4 axis_map, + const int packed_dim) { + // Align packed dim to next multiple of 4 to account for texel padding + sizes[packed_dim] = alignup4(sizes[packed_dim]); + + ivec3 pos; + for (int dim = 0; dim < 3; ++dim) { + pos[axis_map[dim]] = tidx[dim]; + } + + // Adjust batch inner dim by batch index if needed + if (sizes.w > 1) { + pos[axis_map[axis_map.w]] += tidx.w * sizes[axis_map.w]; + } + // Fast division by 4, since moving 1 texel along the packed dim traverses 4 + // tensor elements. + pos[axis_map[packed_dim]] >>= 2; + return pos; +} + +ivec4 tidx_to_posi( + ivec4 tidx, + ivec4 sizes, + const ivec4 axis_map, + const int packed_dim) { + return ivec4( + tidx_to_pos(tidx, sizes, axis_map, packed_dim), tidx[packed_dim] % 4); +} + +ivec3 lpos_to_pos(const ivec3 lpos, const ivec4 axis_map) { + ivec3 pos; + pos[axis_map.x] = lpos.x; + pos[axis_map.y] = lpos.y; + pos[axis_map.z] = lpos.z; + return pos; } +#ifdef USING_BUFFER +#define load_texel(buf, idx) buf[idx] +#elif defined(USING_TEXTURE2D) +#define load_texel(im, pos) texelFetch(im, pos.xy, 0) +#else // defined(USING_TEXTURE3D) +#define load_texel(im, pos) texelFetch(im, pos, 0) +#define load_texel_lpos(im, lpos, axis_map) \ + texelFetch(im, lpos_to_pos(lpos, axis_map), 0) +#endif + +#ifdef USING_BUFFER +#define write_texel(buf, idx, texel) buf[idx] = texel +#elif defined(USING_TEXTURE2D) +#define write_texel(im, pos, texel) imageStore(im, pos.xy, texel) +#else // defined(USING_TEXTURE3D) +#define write_texel(im, pos, texel) imageStore(im, pos, texel) +#define write_texel_lpos(im, lpos, texel, axis_map) \ + imageStore(im, lpos_to_pos(lpos, axis_map), texel) +#endif + +/************************ + * Deprecated Functions * + ************************/ + +// The below functions and macros are in the process of being deprecated in +// favor of newer indexing functions that account for axis mapping and have more +// explicit function names and more updated terminology. 
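Before the deprecated helpers below, a small Python sketch that makes the new terminology concrete: the nchwi <-> tidx round trip and the axis-map permutation. Indices are in WHCN order as documented above; the concrete sizes and axis map are made-up example values:

def tidx_to_nchwi(tidx, sizes):
    # contiguous NCHW staging-buffer index for a WHCN tensor index
    w, h, c, n = tidx
    W, H, C, _ = sizes
    return ((n * C + c) * H + h) * W + w

def nchwi_to_tidx(nchwi, sizes):
    W, H, C, _ = sizes
    return (nchwi % W, (nchwi // W) % H, (nchwi // (W * H)) % C, nchwi // (W * H * C))

def lpos_to_pos(lpos, axis_map):
    # permute a (W, H, C) logical position into an (x, y, z) texture position
    pos = [0, 0, 0]
    for dim in range(3):
        pos[axis_map[dim]] = lpos[dim]
    return tuple(pos)

sizes = (8, 4, 3, 2)                              # W, H, C, N
tidx = (5, 2, 1, 1)
assert nchwi_to_tidx(tidx_to_nchwi(tidx, sizes), sizes) == tidx
assert lpos_to_pos((7, 3, 0), (2, 0, 1, 0)) == (3, 0, 7)   # W -> z, H -> x, C -> y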
+ /* - * Input: (w, h, c, n) tensor index, (W, H, C, N) strides of the tensor buffer - * Returns: the buffer index corresponding to the specified tensor index + * Describes which texture axis the "batches" dimension runs along in a 4D + * texture. + * + * Currently it is set to 2 since we represent batches by concatenating along + * the channels dim, which has index 2 in (W, H, C, N) order and maps to the + * depth dimension of a texture, which also corresponds to index 2 in (x, y, z) + * order. */ -int to_buffer_id(const ivec4 tensor_idx, ivec4 strides) { - return tensor_idx.x * strides.x + tensor_idx.y * strides.y + - tensor_idx.z * strides.z + tensor_idx.w * strides.w; -} +#define BATCH_AXIS 2 // // (w, h, c, n) Tensor Index <-> (x, y, z) Texture Position Conversion @@ -218,26 +326,6 @@ ivec4 to_texture_elem_pos(ivec4 idx, ivec4 sizes, int packed_dim) { return pos; } -// -// Texel Access and Storage -// - -#ifdef USING_BUFFER -#define load_texel(buf, idx) buf[idx] -#elif defined(USING_TEXTURE2D) -#define load_texel(im, pos) texelFetch(im, pos.xy, 0) -#else // defined(USING_TEXTURE3D) -#define load_texel(im, pos) texelFetch(im, pos, 0) -#endif - -#ifdef USING_BUFFER -#define write_texel(buf, idx, texel) buf[idx] = texel -#elif defined(USING_TEXTURE2D) -#define write_texel(im, pos, texel) imageStore(im, pos.xy, texel) -#else // defined(USING_TEXTURE3D) -#define write_texel(im, pos, texel) imageStore(im, pos, texel) -#endif - // // Miscellaneous Utility Functions and Macros // diff --git a/backends/vulkan/runtime/graph/ops/glsl/int8_image_to_nchw_noint8.glsl b/backends/vulkan/runtime/graph/ops/glsl/int8_image_to_nchw_noint8.glsl index b1e3a0abdfe..f7133dd0452 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/int8_image_to_nchw_noint8.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/int8_image_to_nchw_noint8.glsl @@ -16,10 +16,11 @@ layout(std430) buffer; #extension GL_EXT_control_flow_attributes : require -${layout_declare_buffer(0, "w", "nchw_out", "int")} -${layout_declare_tensor(1, "r", "t_in", "int8", "texture3d")} -${layout_declare_ubo(2, "ivec4", "tensor_sizes")} -${layout_declare_ubo(3, "int", "out_numel")} +${layout_declare_buffer(B, "w", "nchw_out", "int")} +${layout_declare_tensor(B, "r", "t_in", "int8", "texture3d")} +${layout_declare_ubo(B, "ivec4", "tensor_sizes")} +${layout_declare_ubo(B, "ivec4", "axis_map")} +${layout_declare_ubo(B, "int", "out_numel")} layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; @@ -40,9 +41,9 @@ void main() { int in_buf_idx = 4 * out_buf_idx; [[unroll]] for (int i = 0; i < 4; ++i) { - const ivec4 tensor_idx = from_nchw_buffer_i(in_buf_idx, tensor_sizes); + const ivec4 tidx = nchwi_to_tidx(in_buf_idx, tensor_sizes); const ivec4 texture_pos = to_texture_elem_pos( - tensor_idx, tensor_sizes, packed_dim); + tidx, tensor_sizes, packed_dim); values[i] = load_texel(t_in, texture_pos.xyz)[texture_pos.w]; in_buf_idx++; } diff --git a/backends/vulkan/runtime/graph/ops/glsl/matmul.h b/backends/vulkan/runtime/graph/ops/glsl/matmul.h deleted file mode 100644 index 620f1fd0e6e..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/matmul.h +++ /dev/null @@ -1,283 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -// To convince the SPIR-V compiler to unroll the loops optimally, need this -// macro -#define FOUR 4 - -#ifdef TILE_ROW_2 -#define TILE_ROWS 2 -#else -#define TILE_ROWS 4 -#endif - -// we avoid mat4 and vec4 usage here as they compile to much less efficient -// SPIR-V -struct FloatMatrix_2d { - float data[TILE_ROWS][FOUR]; -}; - -struct FloatMatrix_3d { - float data[TILE_ROWS][FOUR][FOUR]; -}; - -#ifdef MAT2_IS_TRANSPOSED -vec4 matmul_naive_W_packed_W_packed( -#else -vec4 matmul_naive_W_packed_H_packed( -#endif - const sampler3D im_mat1, - const sampler3D im_mat2, - const ivec3 out_pos, - const int width) { - ivec3 mat1_pos = ivec3(0, out_pos.y, out_pos.z); -#ifdef MAT2_IS_TRANSPOSED - ivec3 mat2_pos = ivec3(0, out_pos.x * 4, 0); -#else - ivec3 mat2_pos = ivec3(out_pos.x * 4, 0, out_pos.z); -#endif - - vec4 texel = vec4(0); - const int K = (width + 3) / 4; - - for (int i = 0; i < K; ++i) { - const vec4 mat1_tex = texelFetch(im_mat1, mat1_pos, 0); -#ifdef MAT2_IS_TRANSPOSED - const vec4 sums = vec4( - dot(mat1_tex, texelFetch(im_mat2, mat2_pos, 0)), - dot(mat1_tex, texelFetch(im_mat2, mat2_pos + ivec3(0, 1, 0), 0)), - dot(mat1_tex, texelFetch(im_mat2, mat2_pos + ivec3(0, 2, 0), 0)), - dot(mat1_tex, texelFetch(im_mat2, mat2_pos + ivec3(0, 3, 0), 0))); -#else - const vec4 sums = vec4( - dot(mat1_tex, texelFetch(im_mat2, mat2_pos, 0)), - dot(mat1_tex, texelFetch(im_mat2, mat2_pos + ivec3(1, 0, 0), 0)), - dot(mat1_tex, texelFetch(im_mat2, mat2_pos + ivec3(2, 0, 0), 0)), - dot(mat1_tex, texelFetch(im_mat2, mat2_pos + ivec3(3, 0, 0), 0))); -#endif - - texel += sums; - - mat1_pos.x++; -#ifdef MAT2_IS_TRANSPOSED - mat2_pos.x++; -#else - mat2_pos.y++; -#endif - } - - return texel; -} - -#ifdef MAT2_IS_TRANSPOSED -vec4 matmul_naive_W_packed_H_packed( -#else -vec4 matmul_naive_W_packed_W_packed( -#endif - const sampler3D im_mat1, - const sampler3D im_mat2, - const ivec3 out_pos, - const int width) { - ivec3 mat1_pos = ivec3(0, out_pos.y, out_pos.z); - ivec3 mat2_pos = ivec3(out_pos.x, 0, out_pos.z); - - vec4 texel = vec4(0); - int K = divup4(width); - - for (int i = 0; i < K; ++i) { - vec4 mat1_tex = texelFetch(im_mat1, mat1_pos, 0); - texel = fma(mat1_tex.xxxx, texelFetch(im_mat2, mat2_pos, 0), texel); - mat2_pos.y++; - texel = fma(mat1_tex.yyyy, texelFetch(im_mat2, mat2_pos, 0), texel); - mat2_pos.y++; - texel = fma(mat1_tex.zzzz, texelFetch(im_mat2, mat2_pos, 0), texel); - mat2_pos.y++; - texel = fma(mat1_tex.wwww, texelFetch(im_mat2, mat2_pos, 0), texel); - mat2_pos.y++; - - mat1_pos.x++; - } - - return texel; -} - -// get texel from self tensor (width_packed) in addmm -vec4 get_texel_W_packed( - sampler3D im_self, - const ivec3 pos, - const bool broadcast_at_width, - const bool broadcast_at_height) { - vec4 self_texel; - // self is of shape {1} - if (broadcast_at_width && broadcast_at_height) { - self_texel = texelFetch(im_self, ivec3(0, 0, 0), 0).xxxx; - } - // self is of shape {*, 1} - else if (broadcast_at_width) { - self_texel = texelFetch(im_self, ivec3(0, pos.y, 0), 0).xxxx; - } - // self is of shape {1, *} - else if (broadcast_at_height) { - self_texel = texelFetch(im_self, ivec3(pos.x, 0, 0), 0); - } else { - self_texel = texelFetch(im_self, ivec3(pos.x, pos.y, 0), 0); - } - - return self_texel; -} - -// get texel from self tensor (channel_packed) in addmm -vec4 get_texel_C_packed( - sampler3D im_self, - const ivec3 pos, - const bool broadcast_at_width, - const bool broadcast_at_height) { - vec4 self_texel; - // self is of shape {1} - if (broadcast_at_width && 
broadcast_at_height) { - self_texel = texelFetch(im_self, ivec3(0, 0, 0), 0); - } - // self is of shape {*, 1} - else if (broadcast_at_width) { - self_texel = texelFetch(im_self, ivec3(0, pos.y, 0), 0); - } - // self is of shape {1, *} - else if (broadcast_at_height) { - self_texel = texelFetch(im_self, ivec3(pos.x, 0, 0), 0); - } else { - self_texel = texelFetch(im_self, ivec3(pos.x, pos.y, 0), 0); - } - - return self_texel; -} - -FloatMatrix_2d matmul_partial_2d( - sampler3D im_mat1, - sampler3D im_mat2, - const ivec3 pos, - const int batch_size, - const int K_texel_len) { - FloatMatrix_2d results; - for (int i = 0; i < TILE_ROWS; i++) { - for (int j = 0; j < FOUR; j++) { - results.data[i][j] = 0.0f; - } - } - vec4 im_mat1_partial_load[TILE_ROWS]; - vec4 im_mat2_partial_load[FOUR]; - - for (int mat1_x = 0; mat1_x < K_texel_len; mat1_x++) { - for (int offset = 0; offset < TILE_ROWS; offset++) { - // read and cache 2x4 (or 4x4) tile of im_mat1 - const int mat1_y = (TILE_ROWS * pos.y) + offset; - const ivec3 mat1_pos = ivec3(mat1_x, mat1_y, 0); - im_mat1_partial_load[offset] = texelFetch(im_mat1, mat1_pos, 0); - // read and cache 4x4 tile of im_mat2 -#ifdef MAT2_IS_TRANSPOSED - const int mat2_y = (FOUR * pos.x) + offset; - const ivec3 mat2_pos = ivec3(mat1_x, mat2_y, 0); - im_mat2_partial_load[offset] = texelFetch(im_mat2, mat2_pos, 0); -#else - const int mat2_x = (FOUR * pos.x) + offset; - const ivec3 mat2_pos = ivec3(mat2_x, mat1_x, 0); - im_mat2_partial_load[offset] = texelFetch(im_mat2, mat2_pos, 0); -#endif - } - -#ifdef TILE_ROW_2 -// column 3 and 4 of im_mat2 -#ifdef MAT2_IS_TRANSPOSED - im_mat2_partial_load[2] = - texelFetch(im_mat2, ivec3(mat1_x, (FOUR * pos.x) + 2, 0), 0); - im_mat2_partial_load[3] = - texelFetch(im_mat2, ivec3(mat1_x, (FOUR * pos.x) + 3, 0), 0); -#else - im_mat2_partial_load[2] = - texelFetch(im_mat2, ivec3((FOUR * pos.x) + 2, mat1_x, 0), 0); - im_mat2_partial_load[3] = - texelFetch(im_mat2, ivec3((FOUR * pos.x) + 3, mat1_x, 0), 0); -#endif -#endif - - // perform partial dot products and add partial result to results - for (int out_row = 0; out_row < TILE_ROWS; out_row++) { - for (int out_col = 0; out_col < FOUR; out_col++) { - results.data[out_row][out_col] += - dot(im_mat1_partial_load[out_row], im_mat2_partial_load[out_col]); - } - } - } - return results; -} - -FloatMatrix_3d matmul_partial_3d( - sampler3D im_mat1, - sampler3D im_mat2, - const ivec3 pos, - const int batch_size, - const int K_texel_len) { - FloatMatrix_3d results; - for (int i = 0; i < TILE_ROWS; i++) { - for (int j = 0; j < FOUR; j++) { - for (int k = 0; k < FOUR; k++) { - results.data[i][j][k] = 0.0f; - } - } - } - vec4 im_mat1_partial_load[TILE_ROWS]; - vec4 im_mat2_partial_load[FOUR]; - - for (int batch_idx = 0; batch_idx < FOUR; batch_idx++) { - if (FOUR * pos.z + batch_idx >= batch_size) { - break; - } - int mat_z = FOUR * pos.z + batch_idx; - for (int mat1_x = 0; mat1_x < K_texel_len; mat1_x++) { - for (int offset = 0; offset < TILE_ROWS; offset++) { - // read and cache 2x4 (or 4x4) tile of im_mat1 - const int mat1_y = (TILE_ROWS * pos.y) + offset; - const ivec3 mat1_pos = ivec3(mat1_x, mat1_y, mat_z); - im_mat1_partial_load[offset] = texelFetch(im_mat1, mat1_pos, 0); - // read and cache 4x4 tile of im_mat2 -#ifdef MAT2_IS_TRANSPOSED - const int mat2_y = (FOUR * pos.x) + offset; - const ivec3 mat2_pos = ivec3(mat1_x, mat2_y, 0); - im_mat2_partial_load[offset] = texelFetch(im_mat2, mat2_pos, 0); -#else - const int mat2_x = (FOUR * pos.x) + offset; - const ivec3 mat2_pos = ivec3(mat2_x, 
mat1_x, mat_z); - im_mat2_partial_load[offset] = texelFetch(im_mat2, mat2_pos, 0); -#endif - } - -#ifdef TILE_ROW_2 -// column 3, and 4 of im_mat2 -#ifdef MAT2_IS_TRANSPOSED - im_mat2_partial_load[2] = - texelFetch(im_mat2, ivec3(mat1_x, (FOUR * pos.x) + 2, 0), 0); - im_mat2_partial_load[3] = - texelFetch(im_mat2, ivec3(mat1_x, (FOUR * pos.x) + 3, 0), 0); -#else - im_mat2_partial_load[2] = - texelFetch(im_mat2, ivec3((FOUR * pos.x) + 2, mat1_x, mat_z), 0); - im_mat2_partial_load[3] = - texelFetch(im_mat2, ivec3((FOUR * pos.x) + 3, mat1_x, mat_z), 0); -#endif -#endif - - // perform partial dot products and add partial result to results - for (int out_row = 0; out_row < TILE_ROWS; out_row++) { - for (int out_col = 0; out_col < FOUR; out_col++) { - results.data[out_row][out_col][batch_idx] += - dot(im_mat1_partial_load[out_row], im_mat2_partial_load[out_col]); - } - } - } - } - return results; -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/matmul_naive_buffer.glsl b/backends/vulkan/runtime/graph/ops/glsl/matmul_naive_buffer.glsl index 81f0a815cb9..e4064eed2fa 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/matmul_naive_buffer.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/matmul_naive_buffer.glsl @@ -32,35 +32,29 @@ ${layout_declare_ubo(9, "int", "out_numel")} layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; void main() { - const ivec4 out_idx = ivec4( + const ivec4 out_bufix = ivec4( gl_GlobalInvocationID.x, gl_GlobalInvocationID.y, gl_GlobalInvocationID.z % out_sizes.z, gl_GlobalInvocationID.z / out_sizes.z); - if (any(greaterThanEqual(out_idx, out_sizes))) { + if (any(greaterThanEqual(out_bufix, out_sizes))) { return; } - int mat1_id = to_buffer_id( - ivec4(0, out_idx.y, out_idx.z, out_idx.w), mat1_strides); - int mat2_id = to_buffer_id( - ivec4(out_idx.x, 0, out_idx.z, out_idx.w), mat2_strides); - - int orig_mat1_id = to_buffer_id( - ivec4(0, out_idx.y, out_idx.z, out_idx.w), mat1_strides); - - int orig_mat2_id = to_buffer_id( - ivec4(out_idx.x, 0, 0, 0), mat2_strides); + int mat1_bufi = tidx_to_bufi( + ivec4(0, out_bufix.y, out_bufix.z, out_bufix.w), mat1_strides); + int mat2_bufi = tidx_to_bufi( + ivec4(out_bufix.x, 0, out_bufix.z, out_bufix.w), mat2_strides); T sum = T(0.0); for (int i = 0; i < mat1_sizes.x; ++i) { - sum += t_mat1[mat1_id] * t_mat2[mat2_id]; + sum += t_mat1[mat1_bufi] * t_mat2[mat2_bufi]; - mat1_id += mat1_strides.x; - mat2_id += mat2_strides.y; + mat1_bufi += mat1_strides.x; + mat2_bufi += mat2_strides.y; } - const int out_id = to_buffer_id(out_idx, out_strides); - t_out[out_id] = T(sum); + const int out_bufi = tidx_to_bufi(out_bufix, out_strides); + t_out[out_bufi] = T(sum); } diff --git a/backends/vulkan/runtime/graph/ops/glsl/matmul_naive_texture3d.glsl b/backends/vulkan/runtime/graph/ops/glsl/matmul_naive_texture3d.glsl deleted file mode 100644 index 7225f2c64a0..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/matmul_naive_texture3d.glsl +++ /dev/null @@ -1,55 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#version 450 core - -#define PRECISION ${PRECISION} - -$if MAT2_IS_TRANSPOSED: - #define MAT2_IS_TRANSPOSED - -#include "indexing_utils.h" -#include "matmul.h" - -${layout_declare_tensor(0, "w", "im_out", DTYPE, "texture3d")} -${layout_declare_tensor(1, "r", "im_mat1", DTYPE, "texture3d")} -${layout_declare_tensor(2, "r", "im_mat2", DTYPE, "texture3d")} -${layout_declare_ubo(3, "ivec3", "out_limits")} -${layout_declare_ubo(4, "ivec4", "in_sizes")} - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -void main() { - const ivec3 pos = ivec3(gl_GlobalInvocationID); - - if (any(greaterThanEqual(pos, out_limits))) { - return; - } - - vec4 texel = vec4(0); - - $if MAT1_PACKING == "W_packed": - $if MAT2_PACKING == "H_packed": - texel = matmul_naive_W_packed_H_packed( - im_mat1, - im_mat2, - pos, - in_sizes[0]); - $elif MAT2_PACKING == "W_packed": - texel = matmul_naive_W_packed_W_packed( - im_mat1, - im_mat2, - pos, - in_sizes[0]); - $else: - $raise Exception("Unsupported value for MAT2_PACKING") - $else: - $raise Exception("Unsupported value combo for MAT1_PACKING and MAT2_PACKING") - - imageStore(im_out, pos, texel); -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/matmul_naive_texture3d.yaml b/backends/vulkan/runtime/graph/ops/glsl/matmul_naive_texture3d.yaml deleted file mode 100644 index bb1eed494a5..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/matmul_naive_texture3d.yaml +++ /dev/null @@ -1,24 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -matmul_naive_texture3d: - parameter_names_with_default_values: - DTYPE: float - STORAGE: texture3d - MAT1_PACKING: W_packed - MAT2_PACKING: H_packed - MAT2_IS_TRANSPOSED: false - generate_variant_forall: - DTYPE: - - VALUE: float - - VALUE: half - shader_variants: - - NAME: matmul_naive_texture3d_W_packed_H_packed - - NAME: matmul_naive_texture3d_W_packed_W_packed - MAT2_PACKING: W_packed - - NAME: matmul_transposed_naive_texture3d_W_packed_W_packed - MAT2_PACKING: W_packed - MAT2_IS_TRANSPOSED: true diff --git a/backends/vulkan/runtime/graph/ops/glsl/matmul_optimized.glsl b/backends/vulkan/runtime/graph/ops/glsl/matmul_optimized.glsl deleted file mode 100644 index 8634371a7b4..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/matmul_optimized.glsl +++ /dev/null @@ -1,87 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#version 450 core - -#define PRECISION ${PRECISION} - -$if MAT2_IS_TRANSPOSED: - #define MAT2_IS_TRANSPOSED - -$if BATCH_MODE: - #define BATCH_MODE - -$if TILE_ROW == "tile_row_2": - #define TILE_ROW_2 - -#include "indexing_utils.h" -#include "matmul.h" - -layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly image3D im_out; -layout(set = 0, binding = 1) uniform PRECISION ${SAMPLER_T[NDIM][DTYPE]} im_mat1; -layout(set = 0, binding = 2) uniform PRECISION ${SAMPLER_T[NDIM][DTYPE]} im_mat2; - -layout(set = 0, binding = 3) uniform PRECISION restrict OutLimits { - ivec3 out_limits; -}; - -layout(set = 0, binding = 4) uniform PRECISION restrict OutSizes { - ivec4 out_sizes; -}; - -layout(set = 0, binding = 5) uniform PRECISION restrict InLimits { - ivec3 in_limits; -}; - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -void main() { - const ivec3 pos = ivec3(gl_GlobalInvocationID); - - if (any(greaterThanEqual(pos, out_limits))) { - return; - } - - $if BATCH_MODE: - FloatMatrix_3d results = matmul_partial_3d( - im_mat1, - im_mat2, - pos, - out_sizes[2], - in_limits[0]); - $else: - FloatMatrix_2d results = matmul_partial_2d( - im_mat1, - im_mat2, - pos, - out_sizes[2], - in_limits[0]); - - for (int idx_c = 0; idx_c < TILE_ROWS; idx_c++) { - for (int idx_r = 0; idx_r < FOUR; idx_r++) { - const ivec3 out_pos = - ivec3(idx_r + FOUR * pos.x, idx_c + TILE_ROWS * pos.y, pos.z); - - // results is in transposed order w.r.t. the desired output - $if BATCH_MODE: - imageStore( - im_out, - out_pos, - vec4( - results.data[idx_c][idx_r][0], - results.data[idx_c][idx_r][1], - results.data[idx_c][idx_r][2], - results.data[idx_c][idx_r][3])); - $else: - imageStore( - im_out, - out_pos, - vec4(results.data[idx_c][idx_r], 0.0, 0.0, 0.0)); - } - } -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/matmul_optimized.yaml b/backends/vulkan/runtime/graph/ops/glsl/matmul_optimized.yaml deleted file mode 100644 index 9268d5a25aa..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/matmul_optimized.yaml +++ /dev/null @@ -1,30 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
- -matmul_optimized: - parameter_names_with_default_values: - DTYPE: float - NDIM: 3 - PACKING: C_packed - MAT2_IS_TRANSPOSED: false - BATCH_MODE: false - TILE_ROW: tile_row_4 - generate_variant_forall: - TILE_ROW: - - VALUE: tile_row_4 - - VALUE: tile_row_2 - DTYPE: - - VALUE: float - - VALUE: half - shader_variants: - - NAME: matmul_optimized - - NAME: matmul_transposed_optimized - MAT2_IS_TRANSPOSED: true - - NAME: batch_matmul_optimized - BATCH_MODE: true - - NAME: batch_matmul_transposed_optimized - MAT2_IS_TRANSPOSED: true - BATCH_MODE: true diff --git a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_buffer.glsl b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_buffer.glsl index d861972f935..ea4e0d300cc 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_buffer.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_buffer.glsl @@ -23,13 +23,13 @@ layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; layout(constant_id = 3) const int UNUSED_packed_dim = W_DIM; void main() { - int out_id = int(gl_GlobalInvocationID.x); - if (out_id >= numel) { + int out_bufi = int(gl_GlobalInvocationID.x); + if (out_bufi >= numel) { return; } - ivec4 out_idx = to_tensor_idx(out_id, out_strides); - const int in_id = to_nchw_buffer_i(out_idx, out_sizes); + ivec4 out_tidx = bufi_to_tidx(out_bufi, out_strides); + const int in_nchwi = tidx_to_nchwi(out_tidx, out_sizes); - t_out[out_id] = nchw_in[in_id]; + t_out[out_bufi] = nchw_in[in_nchwi]; } diff --git a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_image.glsl b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_image.glsl index abe93904805..b86a59fc234 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_image.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_image.glsl @@ -21,42 +21,43 @@ ${define_required_extensions(DTYPE)} layout(std430) buffer; -${layout_declare_tensor(0, "w", "t_out", DTYPE, STORAGE)} -${layout_declare_buffer(1, "r", "nchw_in", DTYPE)} -${layout_declare_ubo(2, "ivec4", "sizes")} +${layout_declare_tensor(B, "w", "t_out", DTYPE, STORAGE)} +${layout_declare_buffer(B, "r", "nchw_in", DTYPE)} +${layout_declare_ubo(B, "ivec4", "sizes")} +${layout_declare_ubo(B, "ivec4", "axis_map")} layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; layout(constant_id = 3) const int packed_dim = C_DIM; -VEC4_T read_texel(ivec4 tensor_idx) { - const ivec4 buf_indices = get_texel_nchw_buffer_ixs( - tensor_idx, +VEC4_T read_texel(ivec4 tidx) { + const ivec4 buf_indices = tidx_to_nchwi( + tidx, sizes, packed_dim); VEC4_T texel = VEC4_T(0); - if (tensor_idx[packed_dim] < sizes[packed_dim]) { + if (tidx[packed_dim] < sizes[packed_dim]) { texel.x = SCALAR_T(nchw_in[buf_indices.x]); } - if (tensor_idx[packed_dim] + 1 < sizes[packed_dim]) { + if (tidx[packed_dim] + 1 < sizes[packed_dim]) { texel.y = SCALAR_T(nchw_in[buf_indices.y]); } - if (tensor_idx[packed_dim] + 2 < sizes[packed_dim]) { + if (tidx[packed_dim] + 2 < sizes[packed_dim]) { texel.z = SCALAR_T(nchw_in[buf_indices.z]); } - if (tensor_idx[packed_dim] + 3 < sizes[packed_dim]) { + if (tidx[packed_dim] + 3 < sizes[packed_dim]) { texel.w = SCALAR_T(nchw_in[buf_indices.w]); } return texel; } void main() { - const ivec3 pos = ivec3(gl_GlobalInvocationID); - const ivec4 tensor_idx = to_tensor_idx(pos, sizes, packed_dim); - if (any(greaterThanEqual(tensor_idx, sizes))) { + const ivec3 lpos = ivec3(gl_GlobalInvocationID); + const ivec4 tidx = lpos_to_tidx(lpos, sizes, axis_map.w, packed_dim); + if (any(greaterThanEqual(tidx, sizes))) { return; } - 
write_texel(t_out, pos, read_texel(tensor_idx)); + write_texel(t_out, lpos_to_pos(lpos, axis_map), read_texel(tidx)); } diff --git a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_int8_image_noint8.glsl b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_int8_image_noint8.glsl index 378cf09d129..f3a3370f3ba 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_int8_image_noint8.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_int8_image_noint8.glsl @@ -16,9 +16,10 @@ layout(std430) buffer; #extension GL_EXT_control_flow_attributes : require -${layout_declare_tensor(0, "w", "t_out", "int8", "texture3d")} -${layout_declare_buffer(1, "r", "nchw_in", "int")} -${layout_declare_ubo(2, "ivec4", "tensor_sizes")} +${layout_declare_tensor(B, "w", "t_out", "int8", "texture3d")} +${layout_declare_buffer(B, "r", "nchw_in", "int")} +${layout_declare_ubo(B, "ivec4", "sizes")} +${layout_declare_ubo(B, "ivec4", "axis_map")} layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; @@ -34,9 +35,9 @@ int extend_sign(int x) { return x; } -ivec4 read_texel(ivec4 tensor_idx) { - const ivec4 buf_indices = get_texel_nchw_buffer_ixs( - tensor_idx, tensor_sizes, packed_dim); +ivec4 read_texel(ivec4 tidx) { + const ivec4 buf_indices = tidx_to_nchwi( + tidx, sizes, packed_dim); int shift = (1 << 8) - 1; ivec4 masks; @@ -51,7 +52,7 @@ ivec4 read_texel(ivec4 tensor_idx) { ivec4 out_tex = ivec4(0); [[unroll]] for (int i = 0; i < 4; ++i) { - if (tensor_idx[packed_dim] + i < tensor_sizes[packed_dim]) { + if (tidx[packed_dim] + i < sizes[packed_dim]) { int in_texel = nchw_in[buf_indices[i] / 4]; int extracted_val = (in_texel & masks[i]) >> (8 * (buf_indices[i] % 4)); extracted_val = extend_sign(extracted_val); @@ -63,12 +64,12 @@ ivec4 read_texel(ivec4 tensor_idx) { } void main() { - const ivec3 pos = ivec3(gl_GlobalInvocationID); - const ivec4 tensor_idx = to_tensor_idx(pos, tensor_sizes, packed_dim); + const ivec3 lpos = ivec3(gl_GlobalInvocationID); + const ivec4 tidx = lpos_to_tidx(lpos, sizes, axis_map.w, packed_dim); - if (any(greaterThanEqual(tensor_idx, tensor_sizes))) { + if (any(greaterThanEqual(tidx, sizes))) { return; } - write_texel(t_out, pos, read_texel(tensor_idx)); + write_texel(t_out, lpos_to_pos(lpos, axis_map), read_texel(tidx)); } diff --git a/backends/vulkan/runtime/graph/ops/glsl/q_4w_linear.glsl b/backends/vulkan/runtime/graph/ops/glsl/q_4w_linear.glsl new file mode 100644 index 00000000000..de42f9ed996 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/q_4w_linear.glsl @@ -0,0 +1,135 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#version 450 core + +#include "indexing_utils.h" + +#define PRECISION ${PRECISION} + +#define FOUR 4 + +#define VEC4_T ${texel_load_type(DTYPE, STORAGE)} +#define FLOAT_T ${buffer_scalar_type(DTYPE)} + +${define_active_storage_type(STORAGE)} + +${define_required_extensions(DTYPE)} +${define_required_extensions("int8")} + +layout(std430) buffer; + +${layout_declare_tensor(0, "w", "t_out", DTYPE, STORAGE)} +${layout_declare_tensor(1, "r", "t_mat1", DTYPE, STORAGE)} +${layout_declare_tensor(2, "r", "t_mat2", "int8", "buffer")} +${layout_declare_tensor(3, "r", "t_scales_and_zeros", DTYPE, STORAGE)} + +$if STORAGE == "texture3d": + ${layout_declare_ubo(4, "ivec4", "out_sizes")} + ${layout_declare_ubo(5, "ivec4", "mat1_sizes")} + ${layout_declare_ubo(6, "ivec4", "mat2_strides")} + ${layout_declare_ubo(7, "ivec4", "scales_strides")} +$else: + ${layout_declare_ubo(4, "ivec4", "out_sizes")} + ${layout_declare_ubo(5, "ivec4", "out_strides")} + ${layout_declare_ubo(6, "ivec4", "mat1_sizes")} + ${layout_declare_ubo(7, "ivec4", "mat1_strides")} + ${layout_declare_ubo(8, "ivec4", "mat2_strides")} + ${layout_declare_ubo(9, "ivec4", "scales_strides")} + +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +layout(constant_id = 3) const int group_size = 1; + +void main() { + + const ivec4 out_pos = ivec4( + gl_GlobalInvocationID.x, // n = 0..N-1 + gl_GlobalInvocationID.y, // m = 0..M-1 + gl_GlobalInvocationID.z % out_sizes.z, + gl_GlobalInvocationID.z / out_sizes.z); + + if (any(greaterThanEqual(out_pos, out_sizes))) { + return; + } + + const uint K = mat1_sizes.x; + const uint n = out_pos.x; + const uint m = out_pos.y; + const uint mask = uint(0x0f); + + float rc = 0.0; + int k = 0; + const uint k_block = (K + group_size - 1) / group_size; + + #ifdef USING_BUFFER + ivec4 mat1_pos = ivec4(0, m, out_pos.z, out_pos.w); + ivec4 mat2_pos = ivec4(0, n, out_pos.z, out_pos.w); + ivec4 scale_pos = ivec4(0, n, 0, out_pos.w); + ivec4 zero_pos = ivec4(0, n, 1, out_pos.w); + + for (int kb = 0; kb < k_block; kb++) { + scale_pos.x = kb; + const int scale_bufi = tidx_to_bufi(scale_pos, scales_strides); + const float scale = float(t_scales_and_zeros[scale_bufi]); + + zero_pos.x = kb; + const int zero_bufi = tidx_to_bufi(zero_pos, scales_strides); + const float zero = float(t_scales_and_zeros[zero_bufi]) - scale * 8.0; + + for(uint idx = 0; idx < group_size && k < K; idx++, k++) { + mat1_pos.x = k; + const int mat1_bufi = tidx_to_bufi(mat1_pos, mat1_strides); + const float mat1_val = float(t_mat1[mat1_bufi]); + + mat2_pos.x = k / 2; + const int mat2_bufi = tidx_to_bufi(mat2_pos, mat2_strides); + // Bitwise op treats sign bit from int8 as a value bit instead, + // since there is no uint8_t datatype + uint mat2_val = (t_mat2[mat2_bufi] & 0xFF); + mat2_val = (k & 1) == 0 ? 
mat2_val & mask : (mat2_val >> 4); + + rc += mat1_val * (scale * float(mat2_val) + zero); + } + } + + const int out_bufi = tidx_to_bufi(out_pos, out_strides); + t_out[out_bufi] = FLOAT_T(rc); + + #else // Using texture + ivec3 mat1_pos = ivec3(0, m, out_pos.z); + ivec4 mat2_pos = ivec4(0, n, out_pos.z, out_pos.w); + ivec3 scale_zero_pos = ivec3(0, n, 0); + uint K_texel = K / FOUR; + + for (int kb = 0; kb < k_block; kb++) { + scale_zero_pos.x = kb; + const vec4 scale_zero = load_texel(t_scales_and_zeros, scale_zero_pos); + const float scale = scale_zero.x; + const float zero = scale_zero.y - scale * 8.0; + + for(uint idx = 0; idx < group_size && k < K_texel; idx += FOUR, k++) { + mat1_pos.x = k; + const VEC4_T mat1_tex = load_texel(t_mat1, mat1_pos); + + mat2_pos.x = k * 2; // k * FOUR / 2 + const int mat2_id = tidx_to_bufi(mat2_pos, mat2_strides); + + for (int texel_pos = 0; texel_pos < FOUR; texel_pos++) { + // Bitwise op treats sign bit from int8 as a value bit instead, + // since there is no uint8_t datatype + uint mat2_val = (t_mat2[mat2_id + texel_pos / 2] & 0xFF); + mat2_val = (texel_pos & 1) == 0 ? mat2_val & mask : (mat2_val >> 4); + rc += mat1_tex[texel_pos] * (scale * float(mat2_val) + zero); + } + } + } + write_texel(t_out, out_pos.xyz, vec4(rc, 0, 0, 0)); + + #endif +} diff --git a/backends/vulkan/runtime/graph/ops/glsl/q_4w_linear.yaml b/backends/vulkan/runtime/graph/ops/glsl/q_4w_linear.yaml new file mode 100644 index 00000000000..fd65068080a --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/q_4w_linear.yaml @@ -0,0 +1,19 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +q_4w_linear: + parameter_names_with_default_values: + DTYPE: float + STORAGE: buffer + generate_variant_forall: + DTYPE: + - VALUE: float + - VALUE: half + STORAGE: + - VALUE: buffer + - VALUE: texture3d + shader_variants: + - NAME: q_4w_linear diff --git a/backends/vulkan/runtime/graph/ops/glsl/q_8w_linear.glsl b/backends/vulkan/runtime/graph/ops/glsl/q_8w_linear.glsl index 7557a7b0c3d..a72df89b634 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/q_8w_linear.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/q_8w_linear.glsl @@ -49,14 +49,14 @@ layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; #ifdef USING_BUFFER void main() { - const int t_id = int(gl_GlobalInvocationID.x); - if (t_id >= out_numel) { + const int out_bufi = int(gl_GlobalInvocationID.x); + if (out_bufi >= out_numel) { return; } - const ivec4 out_idx = to_tensor_idx(t_id, out_strides, 0); + const ivec4 out_tidx = bufi_to_tidx(out_bufi, out_strides, 0); - t_out[t_id] = q_8w_linear(out_idx, mat1_sizes.x); + t_out[out_bufi] = q_8w_linear(out_tidx, mat1_sizes.x); } #else // USING_TEXTURE diff --git a/backends/vulkan/runtime/graph/ops/glsl/slice_channel.glsl b/backends/vulkan/runtime/graph/ops/glsl/slice_channel.glsl index d1562d65762..45e6c3358e8 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/slice_channel.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/slice_channel.glsl @@ -43,11 +43,11 @@ void main() { // we calculate the source whcn-coordinate amended with offset-ed channel // value. Then we calculate the actual texture position from the // whcn-coordinate. 
- const ivec4 buf_indices = get_texel_nchw_buffer_ixs(idx, out_sizes, packed_dim); + const ivec4 buf_indices = tidx_to_nchwi(idx, out_sizes, packed_dim); vec4 outex; for (int i=0;i<4;i++) { - ivec4 user_coor = from_nchw_buffer_i(buf_indices[i], out_sizes); + ivec4 user_coor = nchwi_to_tidx(buf_indices[i], out_sizes); int in_channel = user_coor.z; diff --git a/backends/vulkan/runtime/graph/ops/glsl/unary_op.yaml b/backends/vulkan/runtime/graph/ops/glsl/unary_op.yaml index eb05b10b108..2b9f0032f41 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/unary_op.yaml +++ b/backends/vulkan/runtime/graph/ops/glsl/unary_op.yaml @@ -38,3 +38,5 @@ unary_op: OPERATOR: hardshrink(X, A, B) - NAME: hardswish OPERATOR: hardswish(X) + - NAME: hardsigmoid + OPERATOR: hardsigmoid(X) diff --git a/backends/vulkan/runtime/graph/ops/glsl/view.glsl b/backends/vulkan/runtime/graph/ops/glsl/view.glsl index 0b0f587d1d5..8d45e65b396 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/view.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/view.glsl @@ -39,13 +39,13 @@ void main() { // Assume there is a virtual continuous buffer in nchw format. From the output // pos, we first calculate the index in the virtual buffer, and then calculate // the input position from the index. - const ivec4 buf_indices = get_texel_nchw_buffer_ixs(out_tensor_idx, out_sizes, out_packed_dim); + const ivec4 buf_indices = tidx_to_nchwi(out_tensor_idx, out_sizes, out_packed_dim); VEC4_T value = VEC4_T(0); // Need to look up the 4 values in the output texel separately. for (int i = 0 ; i < 4; i++) { if (out_tensor_idx[out_packed_dim]++ < out_sizes[out_packed_dim]) { - ivec4 user_coor = from_nchw_buffer_i(buf_indices[i], in_sizes); + ivec4 user_coor = nchwi_to_tidx(buf_indices[i], in_sizes); ivec4 in_pos_elem = to_texture_elem_pos(user_coor, in_sizes, in_packed_dim); VEC4_T intex = texelFetch(t_in, in_pos_elem.xyz, 0); value[i] = intex[in_pos_elem.w]; diff --git a/backends/vulkan/runtime/graph/ops/impl/BatchNorm.cpp b/backends/vulkan/runtime/graph/ops/impl/BatchNorm.cpp index 8e346bd2088..eb0f1f99a2f 100644 --- a/backends/vulkan/runtime/graph/ops/impl/BatchNorm.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/BatchNorm.cpp @@ -88,7 +88,7 @@ void add_native_batch_norm_node( {{out_ref, vkapi::MemoryAccessType::WRITE}, {{in_ref, arg_weight, arg_bias, arg_mean, arg_var}, vkapi::MemoryAccessType::READ}}, - {t_out->texture_limits_ubo(), + {t_out->logical_limits_ubo(), graph.create_params_buffer(epsilon), graph.create_params_buffer(num_texel_per_batch)})); } diff --git a/backends/vulkan/runtime/graph/ops/impl/BinaryOp.cpp b/backends/vulkan/runtime/graph/ops/impl/BinaryOp.cpp index 6bab8d19111..3ae67489af9 100644 --- a/backends/vulkan/runtime/graph/ops/impl/BinaryOp.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/BinaryOp.cpp @@ -21,7 +21,7 @@ void check_binary_op_args( const api::vTensor& self, const api::vTensor& other, const api::vTensor& out) { - VK_CHECK_COND(check_same_memory_layout(self, other, out)); + VK_CHECK_COND(check_same_packed_dim(self, other, out)); std::vector broadcasted_sizes = calculate_broadcasted_output_size(self, other); VK_CHECK_COND(out.sizes() == broadcasted_sizes); @@ -53,7 +53,7 @@ void add_binary_op_node( const std::string& op_name) { ValueRef arg1 = prepack_if_tensor_ref(graph, in1); ValueRef arg2 = - prepack_if_tensor_ref(graph, in2, graph.memory_layout_of(arg1)); + prepack_if_tensor_ref(graph, in2, graph.estimate_memory_layout_of(arg1)); vTensorPtr t_in1 = graph.get_tensor(arg1); vTensorPtr t_in2 = graph.get_tensor(arg2); @@
-85,12 +85,15 @@ void add_binary_op_node( {{arg1, arg2}, vkapi::MemoryAccessType::READ}}, // Shader params buffers {t_out->sizes_ubo(), + t_out->axis_map_ubo(), t_in1->sizes_ubo(), + t_in1->axis_map_ubo(), t_in2->sizes_ubo(), + t_in2->axis_map_ubo(), graph.create_params_buffer(broadcast_params), graph.create_params_buffer(alpha_val)}, // Specialization Constants - {SV(t_out->packed_dim_whcn_idx())}, + {SV(t_out->packed_dim())}, // Resizing Logic resize_binary_op_node, {})); diff --git a/backends/vulkan/runtime/graph/ops/impl/Cat.cpp b/backends/vulkan/runtime/graph/ops/impl/Cat.cpp index cd947091bc1..d5cfd5f4505 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Cat.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Cat.cpp @@ -25,7 +25,7 @@ void add_cat_default_node( for (ValueRef input_ref : *input_list) { vTensorPtr t_in = graph.get_tensor(input_ref); - VK_CHECK_COND(check_memory_layout_is(*t_in, utils::kChannelsPacked)); + VK_CHECK_COND(check_packed_dim_is(*t_in, WHCN::kChannelsDim)); } int64_t dim = graph.extract_scalar(dim_ref); @@ -40,7 +40,7 @@ void add_cat_default_node( for (ValueRef input_ref : *input_list) { vTensorPtr t_in = graph.get_tensor(input_ref); - utils::ivec3 range = t_in->texture_limits(); + utils::ivec3 range = t_in->logical_limits(); add_copy_offset_node( graph, input_ref, range, src_offset, dst_offset, out); dst_offset[0] += range[0]; @@ -52,7 +52,7 @@ void add_cat_default_node( for (ValueRef input_ref : *input_list) { vTensorPtr t_in = graph.get_tensor(input_ref); - utils::ivec3 range = t_in->texture_limits(); + utils::ivec3 range = t_in->logical_limits(); add_copy_offset_node( graph, input_ref, range, src_offset, dst_offset, out); dst_offset[1] += range[1]; @@ -63,7 +63,7 @@ void add_cat_default_node( for (ValueRef input_ref : *input_list) { vTensorPtr t_in = graph.get_tensor(input_ref); - utils::ivec3 range = t_in->texture_limits(); + utils::ivec3 range = t_in->logical_limits(); add_copy_offset_node( graph, input_ref, range, src_offset, dst_offset, out); dst_offset[2] += range[2]; diff --git a/backends/vulkan/runtime/graph/ops/impl/Clone.cpp b/backends/vulkan/runtime/graph/ops/impl/Clone.cpp index cef751bc7c8..946a0c9f407 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Clone.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Clone.cpp @@ -32,7 +32,7 @@ void add_clone_node( graph.create_local_wg_size(out), {{out, vkapi::MemoryAccessType::WRITE}, {in, vkapi::MemoryAccessType::READ}}, - {t_out->texture_limits_ubo()})); + {t_out->logical_limits_ubo()})); } void clone(ComputeGraph& graph, const std::vector& args) { diff --git a/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp b/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp index 52af0542b6a..360193fb17f 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp @@ -106,9 +106,9 @@ ValueRef prepack_biases( graph.create_local_wg_size(v), vref, v, - {t->sizes_ubo()}, + {t->sizes_ubo(), t->axis_map_ubo()}, // Specialization constants - {SV(t->packed_dim_whcn_idx())})); + {SV(t->packed_dim())})); return v; } @@ -216,14 +216,14 @@ ValueRef prepack_weights( graph.create_params_buffer( utils::make_ivec4(original_sizes, /*reverse = */ true))}, // Specialization constants - {SV(t->packed_dim_whcn_idx())})); + {SV(t->packed_dim())})); return v; } void check_conv_args(const api::vTensor& in, const api::vTensor& out) { - VK_CHECK_COND(check_memory_layout_is(in, utils::kChannelsPacked)); - VK_CHECK_COND(check_memory_layout_is(out, 
utils::kChannelsPacked)); + VK_CHECK_COND(check_packed_dim_is(in, WHCN::kChannelsDim)); + VK_CHECK_COND(check_packed_dim_is(out, WHCN::kChannelsDim)); } struct Conv2dParams final { @@ -291,7 +291,7 @@ utils::uvec3 create_conv2d_global_wg_size( const Conv2dMethod method, const ValueRef out) { if (method == Conv2dMethod::Pointwise) { - const utils::uvec3 image_extents = graph.image_extents_of(out); + const utils::uvec3 image_extents = graph.logical_limits_of(out); return { utils::div_up(image_extents[0u], 2u), utils::div_up(image_extents[1u], 2u), @@ -376,7 +376,7 @@ void add_conv2d_node( {{arg_in, arg_weight, arg_bias}, vkapi::MemoryAccessType::READ}}, // Shader params buffers { - t_out->texture_limits_ubo(), + t_out->logical_limits_ubo(), t_in->sizes_ubo(), graph.create_params_buffer(kernel_params), graph.create_params_buffer(extra_params), @@ -444,7 +444,7 @@ void add_conv1d_node( int32_t out_group_size = static_cast(out_channels / groups_val); utils::uvec3 global_size = {1, static_cast(out_channels), 1}; - utils::uvec3 local_size = {1, 1, 1}; + utils::uvec3 local_size = {1, 64, 1}; Kernel1dParams kernel_params = { kernel_size, @@ -474,8 +474,12 @@ void add_conv1d_node( {{arg_in, arg_weight, arg_bias}, vkapi::MemoryAccessType::READ}}, // Shader params buffers { - t_out->texture_limits_ubo(), + t_out->logical_limits_ubo(), t_in->sizes_ubo(), + t_out->axis_map_ubo(), + t_in->axis_map_ubo(), + t_weight->axis_map_ubo(), + t_bias->axis_map_ubo(), graph.create_params_buffer(kernel_params), graph.create_params_buffer(out_params), }, @@ -562,6 +566,7 @@ void conv(ComputeGraph& graph, const std::vector& args) { REGISTER_OPERATORS { VK_REGISTER_OP(aten.convolution.default, conv); VK_REGISTER_OP(conv_with_clamp.default, conv); + VK_REGISTER_OP(et_vk.conv_with_clamp.default, conv); } } // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/Copy.cpp b/backends/vulkan/runtime/graph/ops/impl/Copy.cpp index b15844e1409..1fe65611d9f 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Copy.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Copy.cpp @@ -33,19 +33,13 @@ void add_copy_offset_node( add_dtype_suffix(kernel_name, *t_out); const struct Block final { - ivec3 range; - int32_t unused0; - ivec3 src_offset; - int32_t unused1; - ivec3 dst_offset; - int32_t unused2; + alignas(16) ivec3 range; + alignas(16) ivec3 src_offset; + alignas(16) ivec3 dst_offset; } offset_params{ range, - 0, src_offset, - 0, dst_offset, - 0, }; auto shader = VK_KERNEL_FROM_STR(kernel_name); @@ -61,7 +55,11 @@ void add_copy_offset_node( {in, vkapi::MemoryAccessType::READ}, }, // Parameter buffers - {graph.create_params_buffer(offset_params)}, + { + graph.create_params_buffer(offset_params), + t_out->axis_map_ubo(), + t_in->axis_map_ubo(), + }, // Specialization Constants {})); } @@ -80,8 +78,8 @@ void add_copy_channel_offset_node( std::vector in_sizes = t_in->sizes(); std::vector out_sizes = t_out->sizes(); - VK_CHECK_COND(check_memory_layout_is(*t_in, utils::kChannelsPacked)); - VK_CHECK_COND(check_memory_layout_is(*t_out, utils::kChannelsPacked)); + VK_CHECK_COND(check_packed_dim_is(*t_in, WHCN::kChannelsDim)); + VK_CHECK_COND(check_packed_dim_is(*t_out, WHCN::kChannelsDim)); // NOTE: This function should be able to support 1d and 2d tensors when // range=1, src_offset=dst_offset=1. 
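A minimal standalone sketch of the parameter-block change made in Copy.cpp above, assuming a plain three-int stand-in for utils::ivec3 and a std430 block on the shader side; the struct names, the stand-in type, and the static_asserts below are illustrative and not part of the patch:

#include <cstddef>
#include <cstdint>

// Stand-in for utils::ivec3: three 4-byte ints, 12 bytes, 4-byte aligned.
struct ivec3 {
  int32_t x, y, z;
};

// Old style: pad each ivec3 out to 16 bytes by hand with unused fields.
struct BlockManual {
  ivec3 range;
  int32_t unused0;
  ivec3 src_offset;
  int32_t unused1;
  ivec3 dst_offset;
  int32_t unused2;
};

// New style: let alignas(16) introduce the same padding implicitly.
struct BlockAligned {
  alignas(16) ivec3 range;
  alignas(16) ivec3 src_offset;
  alignas(16) ivec3 dst_offset;
};

// Both layouts start each ivec3 on a 16-byte boundary, matching the 16-byte
// base alignment an ivec3 member has in a std430 block, so the shader reads
// identical data either way.
static_assert(sizeof(BlockManual) == 48, "manual padding yields 48 bytes");
static_assert(sizeof(BlockAligned) == 48, "alignas padding yields 48 bytes");
static_assert(offsetof(BlockManual, dst_offset) == 32, "same member offsets");
static_assert(offsetof(BlockAligned, dst_offset) == 32, "same member offsets");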
diff --git a/backends/vulkan/runtime/graph/ops/impl/Embedding.cpp b/backends/vulkan/runtime/graph/ops/impl/Embedding.cpp index be0b457b79c..2d733b4964c 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Embedding.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Embedding.cpp @@ -21,9 +21,9 @@ void check_embedding_args( const api::vTensor& weight, const api::vTensor& in, const api::vTensor& out) { - VK_CHECK_COND(check_memory_layout_is(weight, utils::kChannelsPacked)); - VK_CHECK_COND(check_memory_layout_is(in, utils::kChannelsPacked)); - VK_CHECK_COND(check_memory_layout_is(out, utils::kChannelsPacked)); + VK_CHECK_COND(check_packed_dim_is(weight, WHCN::kChannelsDim)); + VK_CHECK_COND(check_packed_dim_is(in, WHCN::kChannelsDim)); + VK_CHECK_COND(check_packed_dim_is(out, WHCN::kChannelsDim)); } void add_embedding_node( @@ -48,7 +48,12 @@ void add_embedding_node( graph.create_local_wg_size(out), {{out, vkapi::MemoryAccessType::WRITE}, {{in, weight}, vkapi::MemoryAccessType::READ}}, - {t_out->sizes_ubo()})); + { + t_out->sizes_ubo(), + t_out->axis_map_ubo(), + t_in->axis_map_ubo(), + t_weight->axis_map_ubo(), + })); } void embedding(ComputeGraph& graph, const std::vector& args) { diff --git a/backends/vulkan/runtime/graph/ops/impl/Full.cpp b/backends/vulkan/runtime/graph/ops/impl/Full.cpp index 157515e6e0a..34acb43c668 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Full.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Full.cpp @@ -54,7 +54,7 @@ void add_full_node( // Shader params buffers {t_out->sizes_ubo(), graph.create_params_buffer(fill_value_val)}, // Specialization Constants - {SV(t_out->packed_dim_whcn_idx())}, + {SV(t_out->packed_dim())}, // Resizing Logic resize_full_node, {size_or_in})); diff --git a/backends/vulkan/runtime/graph/ops/impl/IndexSelect.cpp b/backends/vulkan/runtime/graph/ops/impl/IndexSelect.cpp index 7b4e45262c0..d9a0cdedd79 100644 --- a/backends/vulkan/runtime/graph/ops/impl/IndexSelect.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/IndexSelect.cpp @@ -21,9 +21,9 @@ void check_index_select_args( const api::vTensor& in, const api::vTensor& idx, const api::vTensor& out) { - VK_CHECK_COND(check_memory_layout_is(in, utils::kChannelsPacked)); - VK_CHECK_COND(check_memory_layout_is(idx, utils::kChannelsPacked)); - VK_CHECK_COND(check_memory_layout_is(out, utils::kChannelsPacked)); + VK_CHECK_COND(check_packed_dim_is(in, WHCN::kChannelsDim)); + VK_CHECK_COND(check_packed_dim_is(idx, WHCN::kChannelsDim)); + VK_CHECK_COND(check_packed_dim_is(out, WHCN::kChannelsDim)); } void add_index_select_channel_node( diff --git a/backends/vulkan/runtime/graph/ops/impl/Linear.cpp b/backends/vulkan/runtime/graph/ops/impl/Linear.cpp index 63b60bf52f7..b96b8840026 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Linear.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Linear.cpp @@ -36,7 +36,7 @@ void check_addmm_args( VK_CHECK_COND(mat1_sizes.size() == 2 || mat1_sizes.size() == 3); VK_CHECK_COND(mat1_sizes.size() == mat2_sizes.size()); - VK_CHECK_COND(graph.memory_layout_of(mat1) == graph.memory_layout_of(out)); + VK_CHECK_COND(graph.packed_dim_of(mat1) == graph.packed_dim_of(out)); VK_CHECK_COND(utils::val_at(-1, mat1_sizes) == utils::val_at(-2, mat2_sizes)); @@ -100,27 +100,36 @@ void add_addmm_naive_node( std::string kernel_name = graph.get_bool(mat2_is_transposed) ? 
"linear_naive" : "addmm_naive"; kernel_name.reserve(kShaderNameReserve); - add_memory_layout_suffix(kernel_name, graph.memory_layout_of(mat1)); - add_memory_layout_suffix(kernel_name, graph.memory_layout_of(mat2)); + add_storage_type_suffix(kernel_name, graph.storage_type_of(out)); add_dtype_suffix(kernel_name, graph.dtype_of(out)); + utils::uvec3 global_wg_size = graph.logical_limits_of(out); graph.execute_nodes().emplace_back(new ExecuteNode( graph, VK_KERNEL_FROM_STR(kernel_name), - graph.create_global_wg_size(out), - graph.create_local_wg_size(out), + global_wg_size, + graph.create_local_wg_size(global_wg_size), // Inputs and Outputs {{out, vkapi::MemoryAccessType::WRITE}, {{mat1, mat2, self}, vkapi::MemoryAccessType::READ}}, // Shader params buffers { - graph.texture_limits_ubo(out), + graph.sizes_ubo(out), + graph.logical_limits_ubo(out), + graph.axis_map_ubo(out), graph.sizes_ubo(mat1), + graph.axis_map_ubo(mat1), + graph.sizes_ubo(mat2), + graph.axis_map_ubo(mat2), graph.sizes_ubo(self), + graph.axis_map_ubo(self), graph.create_params_buffer(params), }, // Specialization Constants - {}, + {graph.packed_dim_of(out), + graph.packed_dim_of(mat1), + graph.packed_dim_of(mat2), + graph.packed_dim_of(self)}, // Resizing Logic resize_addmm_node, {mat2_is_transposed})); @@ -151,7 +160,7 @@ void add_addmm_optimized_node( ValueRef mat2_packed = mat2; const utils::GPUMemoryLayout mat2_layout = mat2_is_transposed_val ? utils::kWidthPacked : utils::kHeightPacked; - if (graph.memory_layout_of(mat2) != mat2_layout) { + if (graph.estimate_memory_layout_of(mat2) != mat2_layout) { mat2_packed = graph.add_tensor_like(mat2, mat2_layout); viewFn(graph, {mat2, graph.add_none(), mat2_packed}); } @@ -173,11 +182,20 @@ void add_addmm_optimized_node( add_dtype_suffix(kernel_name, graph.dtype_of(out)); - utils::uvec3 global_size; + utils::uvec3 global_size = graph.logical_limits_of(out); + + // Each thread computes a W=(2/4) x H=4 x C=(1/4) output tile. Therefore, the + // total number of threads is W/(2 or 4) x H/4 x C/1. Since the out tensor is + // channels packed, C does not need to be divided by 4. The "identity" of each + // thread is the (x, y, z) coordinate of the output tile it is computing, and + // this identity can be used to compute the tensor index of the top left + // element in the tile, which will be [W=x*(2 or 4), H=y*4, C=z*(1 or 4), N=0] if (mat1_sizes.at(mat1_dims - 2) < 8) { - global_size = utils::divup_vec(graph.image_extents_of(out), {4, 2, 1}); + // Use `logical_extents` instead of `image_extents` because the workgroup + // axes need to correspond to tensor dimensions. 
+ global_size = utils::divup_vec(global_size, {4, 2, 1}); } else { - global_size = utils::divup_vec(graph.image_extents_of(out), {4, 4, 1}); + global_size = utils::divup_vec(global_size, {4, 4, 1}); } utils::uvec3 local_size = adaptive_work_group_size(global_size); @@ -191,14 +209,18 @@ void add_addmm_optimized_node( {{mat1_W_packed, mat2_packed, self}, vkapi::MemoryAccessType::READ}}, // Shader params buffers { - graph.texture_limits_ubo(out), graph.sizes_ubo(out), + graph.axis_map_ubo(out), + graph.sizes_ubo(mat1_W_packed), + graph.axis_map_ubo(mat1_W_packed), + graph.sizes_ubo(mat2_packed), + graph.axis_map_ubo(mat2_packed), graph.sizes_ubo(self), - graph.texture_limits_ubo(mat1_W_packed), + graph.axis_map_ubo(self), graph.create_params_buffer(params), }, // Specialization Constants - {}, + {graph.packed_dim_of(out)}, // Resizing Logic resize_addmm_node, {mat2_is_transposed})); @@ -224,10 +246,10 @@ void add_addmm_node( } Params params = {alpha_val, beta_val}; - if (graph.memory_layout_of(mat1) == utils::kChannelsPacked) { + if (graph.packed_dim_of(mat1) == WHCN::kChannelsDim) { add_addmm_optimized_node( graph, self, mat1, mat2, beta, alpha, out, params, mat2_is_transposed); - } else if (graph.memory_layout_of(mat1) == utils::kWidthPacked) { + } else if (graph.packed_dim_of(mat1) == WHCN::kWidthDim) { add_addmm_naive_node( graph, self, mat1, mat2, beta, alpha, out, params, mat2_is_transposed); } else { diff --git a/backends/vulkan/runtime/graph/ops/impl/MatMul.cpp b/backends/vulkan/runtime/graph/ops/impl/MatMul.cpp index 2d9346e1340..1034dc445e8 100644 --- a/backends/vulkan/runtime/graph/ops/impl/MatMul.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/MatMul.cpp @@ -29,7 +29,7 @@ void check_matmul_args( VK_CHECK_COND(mat1_sizes.size() == 2 || mat1_sizes.size() == 3); VK_CHECK_COND(mat1_sizes.size() == mat2_sizes.size()); - VK_CHECK_COND(graph.memory_layout_of(mat1) == graph.memory_layout_of(out)); + VK_CHECK_COND(graph.packed_dim_of(mat1) == graph.packed_dim_of(out)); VK_CHECK_COND(utils::val_at(-1, mat1_sizes) == utils::val_at(-2, mat2_sizes)); } @@ -116,25 +116,31 @@ void add_matmul_naive_texture3d_node( : "matmul_naive"; kernel_name.reserve(kShaderNameReserve); add_storage_type_suffix(kernel_name, graph.storage_type_of(out)); - add_memory_layout_suffix(kernel_name, graph.memory_layout_of(mat1)); - add_memory_layout_suffix(kernel_name, graph.memory_layout_of(mat2)); add_dtype_suffix(kernel_name, graph.dtype_of(out)); + utils::uvec3 global_wg_size = graph.logical_limits_of(out); graph.execute_nodes().emplace_back(new ExecuteNode( graph, VK_KERNEL_FROM_STR(kernel_name), - graph.create_global_wg_size(out), - graph.create_local_wg_size(out), + global_wg_size, + graph.create_local_wg_size(global_wg_size), // Inputs and Outputs {{out, vkapi::MemoryAccessType::WRITE}, {{mat1, mat2}, vkapi::MemoryAccessType::READ}}, // Shader params buffers { - graph.texture_limits_ubo(out), + graph.sizes_ubo(out), + graph.logical_limits_ubo(out), + graph.axis_map_ubo(out), graph.sizes_ubo(mat1), + graph.axis_map_ubo(mat1), + graph.sizes_ubo(mat2), + graph.axis_map_ubo(mat2), }, // Specialization Constants - {}, + {graph.packed_dim_of(out), + graph.packed_dim_of(mat1), + graph.packed_dim_of(mat2)}, // Resizing Logic resize_matmul_node, {mat2_is_transposed})); @@ -159,7 +165,7 @@ void add_matmul_optimized_node( ValueRef mat2_packed = mat2; const utils::GPUMemoryLayout mat2_layout = mat2_is_transposed_val ? 
utils::kWidthPacked : utils::kHeightPacked; - if (graph.memory_layout_of(mat2) != mat2_layout) { + if (graph.estimate_memory_layout_of(mat2) != mat2_layout) { mat2_packed = graph.add_tensor_like(mat2, mat2_layout); viewFn(graph, {mat2, graph.add_none(), mat2_packed}); } @@ -181,12 +187,21 @@ void add_matmul_optimized_node( add_dtype_suffix(kernel_name, graph.dtype_of(out)); - utils::uvec3 global_size; + // Each thread computes a W=(2/4) x H=4 x C=(1/4) output tile. Therefore, the + // total number of threads is W/(2 or 4) x H/4 x C/1. Since the out tensor is + // channels packed, C does not need to be divided by 4. The "identity" of each + // thread is the (x, y, z) coordinate of the output tile it is computing, and + // this identity can be used to compute the tensor index of the top left + // element in the tile, which will be [W=x*(2 or 4), H=y*4, C=z*(1 or 4), N=0] + utils::uvec3 global_size = graph.logical_limits_of(out); if (mat1_sizes.at(mat1_dims - 2) < 8) { - global_size = utils::divup_vec(graph.image_extents_of(out), {4, 2, 1}); + // Use `logical_extents` instead of `image_extents` because the workgroup + // axes need to correspond to tensor dimensions. + global_size = utils::divup_vec(global_size, {4, 2, 1}); } else { - global_size = utils::divup_vec(graph.image_extents_of(out), {4, 4, 1}); + global_size = utils::divup_vec(global_size, {4, 4, 1}); } + utils::uvec3 local_size = adaptive_work_group_size(global_size); graph.execute_nodes().emplace_back(new ExecuteNode( @@ -199,12 +214,15 @@ void add_matmul_optimized_node( {{mat1_W_packed, mat2_packed}, vkapi::MemoryAccessType::READ}}, // Shader params buffers { - graph.texture_limits_ubo(out), graph.sizes_ubo(out), - graph.texture_limits_ubo(mat1_W_packed), + graph.axis_map_ubo(out), + graph.sizes_ubo(mat1_W_packed), + graph.axis_map_ubo(mat1_W_packed), + graph.sizes_ubo(mat2_packed), + graph.axis_map_ubo(mat2_packed), }, // Specialization Constants - {}, + {graph.packed_dim_of(out)}, // Resizing Logic resize_matmul_node, {mat2_is_transposed})); @@ -219,13 +237,13 @@ void add_matmul_node( if (graph.is_buffer_storage(out)) { add_matmul_naive_buffer_node( graph, mat1, mat2_data, out, mat2_is_transposed); - } else if (graph.memory_layout_of(mat1) == utils::kChannelsPacked) { + } else if (graph.packed_dim_of(mat1) == WHCN::kChannelsDim) { add_matmul_optimized_node(graph, mat1, mat2_data, out, mat2_is_transposed); - } else if (graph.memory_layout_of(mat1) == utils::kWidthPacked) { + } else if (graph.packed_dim_of(mat1) == WHCN::kWidthDim) { add_matmul_naive_texture3d_node( graph, mat1, mat2_data, out, mat2_is_transposed); } else { - VK_THROW("Input should be channel packed or width packed."); + VK_THROW("Input texture should be channel packed or width packed."); } } diff --git a/backends/vulkan/runtime/graph/ops/impl/NativeLayerNorm.cpp b/backends/vulkan/runtime/graph/ops/impl/NativeLayerNorm.cpp index 2b15d924706..553075fc4bb 100644 --- a/backends/vulkan/runtime/graph/ops/impl/NativeLayerNorm.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/NativeLayerNorm.cpp @@ -49,8 +49,8 @@ void resize_native_layer_norm_node( } void check_args(const api::vTensor& in, const api::vTensor& out) { - VK_CHECK_COND(check_memory_layout_is(in, utils::kChannelsPacked)); - VK_CHECK_COND(check_memory_layout_is(out, utils::kChannelsPacked)); + VK_CHECK_COND(check_packed_dim_is(in, WHCN::kChannelsDim)); + VK_CHECK_COND(check_packed_dim_is(out, WHCN::kChannelsDim)); } void add_native_layer_norm_node( @@ -76,10 +76,10 @@ void add_native_layer_norm_node( } 
ValueRef arg_in = prepack_if_tensor_ref(graph, in); - ValueRef arg_weight = - prepack_if_tensor_ref(graph, weight, graph.memory_layout_of(arg_in)); - ValueRef arg_bias = - prepack_if_tensor_ref(graph, bias, graph.memory_layout_of(arg_in)); + ValueRef arg_weight = prepack_if_tensor_ref( + graph, weight, graph.estimate_memory_layout_of(arg_in)); + ValueRef arg_bias = prepack_if_tensor_ref( + graph, bias, graph.estimate_memory_layout_of(arg_in)); const auto out_val = graph.get_value_list(out); vTensorPtr t_out = graph.get_tensor(out_val->at(0)); @@ -91,7 +91,7 @@ void add_native_layer_norm_node( std::vector in_sizes = t_input->sizes(); - utils::uvec3 global_size = t_mean->image_extents(); + utils::uvec3 global_size = t_mean->logical_limits(); utils::uvec3 local_size = adaptive_work_group_size(global_size); std::string kernel_name("native_layer_norm"); @@ -109,7 +109,7 @@ void add_native_layer_norm_node( vkapi::MemoryAccessType::WRITE}, {{arg_in, arg_weight, arg_bias}, vkapi::MemoryAccessType::READ}}, // Shader params buffers - {t_out->texture_limits_ubo(), + {t_out->logical_limits_ubo(), t_out->sizes_ubo(), graph.create_params_buffer(epsilon)}, // Specialization Constants diff --git a/backends/vulkan/runtime/graph/ops/impl/Permute.cpp b/backends/vulkan/runtime/graph/ops/impl/Permute.cpp index e78fca15a0a..e45a333123d 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Permute.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Permute.cpp @@ -28,8 +28,8 @@ void check_args( const api::vTensor& in, const std::vector& permute_dims, const api::vTensor& out) { - VK_CHECK_COND(check_memory_layout_is(in, utils::kChannelsPacked)); - VK_CHECK_COND(check_memory_layout_is(out, utils::kChannelsPacked)); + VK_CHECK_COND(check_packed_dim_is(in, WHCN::kChannelsDim)); + VK_CHECK_COND(check_packed_dim_is(out, WHCN::kChannelsDim)); // This implementation doesn't require the input tensor to have the same // dim size as the argument.
The code will work as long as the input tensor's @@ -90,7 +90,7 @@ void add_permute_node( graph.create_local_wg_size(out), {{out, vkapi::MemoryAccessType::WRITE}, {in, vkapi::MemoryAccessType::READ}}, - {t_out->texture_limits_ubo(), + {t_out->logical_limits_ubo(), t_out->sizes_ubo(), graph.create_params_buffer(params)}, // Specialization Constants diff --git a/backends/vulkan/runtime/graph/ops/impl/Pool.cpp b/backends/vulkan/runtime/graph/ops/impl/Pool.cpp index 8b477d3a31a..ba8d971a1af 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Pool.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Pool.cpp @@ -18,8 +18,8 @@ namespace vkcompute { void check_pool2d_args(const api::vTensor& in, const api::vTensor& out) { - VK_CHECK_COND(check_memory_layout_is(in, utils::kChannelsPacked)); - VK_CHECK_COND(check_memory_layout_is(out, utils::kChannelsPacked)); + VK_CHECK_COND(check_packed_dim_is(in, WHCN::kChannelsDim)); + VK_CHECK_COND(check_packed_dim_is(out, WHCN::kChannelsDim)); } void resize_pool2d_node( @@ -79,7 +79,7 @@ void add_max_pool2d_node( check_pool2d_args(*t_in, *t_out); - utils::uvec3 global_size = t_out->image_extents(); + utils::uvec3 global_size = t_out->logical_limits(); utils::uvec3 local_size = adaptive_work_group_size(global_size); std::string kernel_name("max_pool2d"); @@ -103,7 +103,7 @@ void add_max_pool2d_node( {arg, vkapi::MemoryAccessType::READ}}, // Shader params buffers { - t_out->texture_limits_ubo(), + t_out->logical_limits_ubo(), t_in->sizes_ubo(), graph.create_params_buffer(kernel_params), }, @@ -155,7 +155,7 @@ void add_avg_pool2d_node( check_pool2d_args(*t_in, *t_out); - utils::uvec3 global_size = t_out->image_extents(); + utils::uvec3 global_size = t_out->logical_limits(); utils::uvec3 local_size = adaptive_work_group_size(global_size); std::string kernel_name("avg_pool2d"); @@ -176,7 +176,7 @@ void add_avg_pool2d_node( {{out, vkapi::MemoryAccessType::WRITE}, {arg, vkapi::MemoryAccessType::READ}}, // Shader params buffers - {t_out->texture_limits_ubo(), + {t_out->logical_limits_ubo(), t_in->sizes_ubo(), graph.create_params_buffer(kernel_params), graph.create_params_buffer(divisor_params)}, diff --git a/backends/vulkan/runtime/graph/ops/impl/QuantizedLinear.cpp b/backends/vulkan/runtime/graph/ops/impl/QuantizedLinear.cpp index 732643ef754..28bf6513957 100644 --- a/backends/vulkan/runtime/graph/ops/impl/QuantizedLinear.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/QuantizedLinear.cpp @@ -30,7 +30,7 @@ void check_qlinear_args( VK_CHECK_COND(qmat2_sizes.size() == 2); VK_CHECK_COND(scales_sizes.size() == 1); - VK_CHECK_COND(graph.memory_layout_of(mat1) == graph.memory_layout_of(out)); + VK_CHECK_COND(graph.packed_dim_of(mat1) == graph.packed_dim_of(out)); VK_CHECK_COND( utils::val_at(-1, mat1_sizes) == utils::val_at(-1, qmat2_sizes)); @@ -78,8 +78,8 @@ void add_q_8w_linear_node( std::string kernel_name = "q_8w_linear"; kernel_name.reserve(kShaderNameReserve); - add_memory_layout_suffix(kernel_name, graph.memory_layout_of(mat1)); - add_memory_layout_suffix(kernel_name, graph.memory_layout_of(q_mat2)); + add_packed_dim_suffix(kernel_name, graph.packed_dim_of(mat1)); + add_packed_dim_suffix(kernel_name, graph.packed_dim_of(q_mat2)); add_dtype_suffix(kernel_name, graph.dtype_of(out)); add_storage_type_suffix(kernel_name, graph.storage_type_of(out)); @@ -94,7 +94,7 @@ void add_q_8w_linear_node( graph.strides_ubo(q_mat2), graph.strides_ubo(scales)}); } else { - ubos.append({graph.texture_limits_ubo(out), graph.sizes_ubo(mat1)}); + 
ubos.append({graph.logical_limits_ubo(out), graph.sizes_ubo(mat1)}); } graph.execute_nodes().emplace_back(new ExecuteNode( diff --git a/backends/vulkan/runtime/graph/ops/impl/QuantizedMatMul.cpp b/backends/vulkan/runtime/graph/ops/impl/QuantizedMatMul.cpp new file mode 100644 index 00000000000..17bd62ad6ea --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/impl/QuantizedMatMul.cpp @@ -0,0 +1,184 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include +#include +#include + +namespace vkcompute { + +void check_q_matmul_args( + ComputeGraph& graph, + const ValueRef mat1, + const ValueRef mat2_data, + const ValueRef group_size_data, + const ValueRef scales_and_zeros, + const ValueRef out) { + const std::vector mat1_sizes = graph.sizes_of(mat1); + const std::vector mat2_sizes = graph.sizes_of(mat2_data); + const std::vector scales_and_zeros_sizes = + graph.sizes_of(scales_and_zeros); + + const uint32_t group_size = graph.extract_scalar(group_size_data); + + VK_CHECK_COND(mat1_sizes.size() == 2); + VK_CHECK_COND(mat1_sizes.size() == mat2_sizes.size()); + + using namespace WHCN; + VK_CHECK_COND(graph.packed_dim_of(mat1) == kWidthDim); + VK_CHECK_COND(graph.packed_dim_of(mat2_data) == kWidthDim); + // VK_CHECK_COND(graph.packed_dim_of(scales_and_zeros) == kWidthDim); + + if (graph.storage_type_of(scales_and_zeros) == utils::kBuffer) { + VK_CHECK_COND(graph.packed_dim_of(scales_and_zeros) == kWidthDim); + } else { + VK_CHECK_COND(graph.packed_dim_of(scales_and_zeros) == kChannelsDim); + } + + if (graph.storage_type_of(out) == utils::kBuffer) { + VK_CHECK_COND(graph.packed_dim_of(out) == kWidthDim); + } else { + VK_CHECK_COND(graph.packed_dim_of(out) == kChannelsDim); + } + + const int mat1_K = utils::val_at(-1, mat1_sizes); + const int mat2_K = utils::val_at(-1, mat2_sizes) * 2; + const int N = utils::val_at(-2, mat2_sizes); + + VK_CHECK_COND(mat1_K == mat2_K); + + VK_CHECK_COND(mat2_K % group_size == 0); + + const uint32_t k_groups = mat2_K / group_size; + + VK_CHECK_COND(scales_and_zeros_sizes.size() == 3); + VK_CHECK_COND(utils::val_at(-1, scales_and_zeros_sizes) == k_groups); + VK_CHECK_COND(utils::val_at(-2, scales_and_zeros_sizes) == N); + VK_CHECK_COND(utils::val_at(-3, scales_and_zeros_sizes) == 2); + + // Match https://fburl.com/code/6ostkknm + std::vector valid_group_sizes = {32, 64, 128, 256}; + + bool is_valid_group_size = false; + for (auto valid_group_size : valid_group_sizes) { + if (group_size == valid_group_size) { + is_valid_group_size = true; + break; + } + } + + VK_CHECK_COND(is_valid_group_size); +} + +void resize_q_matmul_node( + ComputeGraph* graph, + const std::vector& args, + const std::vector& extra_args) { + (void)extra_args; + + vTensorPtr out = graph->get_tensor(args[0].refs[0]); + vTensorPtr mat1 = graph->get_tensor(args[1].refs[0]); + vTensorPtr mat2 = graph->get_tensor(args[1].refs[1]); + + const int out_cols = utils::val_at(-2, mat1->sizes()); + const int out_rows = utils::val_at(-2, mat2->sizes()); + + std::vector new_out_sizes(3); + if (mat1->sizes().size() == 2) { + new_out_sizes.resize(2); + new_out_sizes.at(0) = out_cols; + new_out_sizes.at(1) = out_rows; + } else { + new_out_sizes.at(0) = mat1->sizes().at(0); + new_out_sizes.at(1) = out_cols; + new_out_sizes.at(2) = out_rows; + } + + out->virtual_resize(new_out_sizes); +} + +void add_q_matmul_node( + ComputeGraph& 
graph, + const ValueRef mat1, + const ValueRef mat2_data, + const ValueRef group_size, + const ValueRef scales_and_zeros_data, + const ValueRef out) { + auto storage_type = graph.storage_type_of(out); + + ValueRef mat2 = + prepack_buffer_if_tensor_ref(graph, mat2_data, utils::kWidthPacked); + + ValueRef scales_and_zeros = + prepack_if_tensor_ref(graph, scales_and_zeros_data, utils::kWidthPacked); + + std::string kernel_name = "q_4w_linear"; + + add_dtype_suffix(kernel_name, graph.dtype_of(out)); + add_storage_type_suffix(kernel_name, storage_type); + + const uint32_t group_size_val = graph.extract_scalar(group_size); + + vkapi::ParamsBindList ubos({}); + if (storage_type == utils::kBuffer) { + ubos.append(graph.sizes_ubo(out)); + ubos.append(graph.strides_ubo(out)); + ubos.append(graph.sizes_ubo(mat1)); + ubos.append(graph.strides_ubo(mat1)); + ubos.append(graph.strides_ubo(mat2)); + ubos.append(graph.strides_ubo(scales_and_zeros)); + } else { + ubos.append(graph.sizes_ubo(out)); + ubos.append(graph.sizes_ubo(mat1)); + ubos.append(graph.strides_ubo(mat2)); + ubos.append(graph.strides_ubo(scales_and_zeros)); + } + + auto out_sizes = graph.sizes_of(out); + uint32_t N = utils::val_at(-1, out_sizes); + uint32_t M = utils::val_at(-2, out_sizes); + + utils::uvec3 global_wg_size = {N, M, 1}; + + utils::uvec3 local_wg_size = adaptive_work_group_size(global_wg_size); + + graph.execute_nodes().emplace_back(new ExecuteNode( + graph, + VK_KERNEL_FROM_STR(kernel_name), + global_wg_size, + local_wg_size, + // Inputs and Outputs + {{out, vkapi::MemoryAccessType::WRITE}, + {{mat1, mat2, scales_and_zeros}, vkapi::MemoryAccessType::READ}}, + // Shader params buffers + ubos, + // Specialization Constants + {SV(group_size_val)}, + // Resizing Logic + resize_q_matmul_node, + {})); +} + +void int4pack_mm(ComputeGraph& graph, const std::vector& args) { + check_q_matmul_args(graph, args[0], args[1], args[2], args[3], args[4]); + return add_q_matmul_node( + graph, + args[0], // mat1 + args[1], // mat2 + args[2], // group_size + args[3], // scales_and_zeros + args[4] // out + ); +} + +REGISTER_OPERATORS { + VK_REGISTER_OP(aten._weight_int4pack_mm.default, int4pack_mm); +} + +} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/Repeat.cpp b/backends/vulkan/runtime/graph/ops/impl/Repeat.cpp index 3ef80dc49c7..741b65a84f0 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Repeat.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Repeat.cpp @@ -23,8 +23,8 @@ void check_args( const api::vTensor& in, const std::vector& repeats, const api::vTensor& out) { - VK_CHECK_COND(check_memory_layout_is(in, utils::kChannelsPacked)); - VK_CHECK_COND(check_memory_layout_is(out, utils::kChannelsPacked)); + VK_CHECK_COND(check_packed_dim_is(in, WHCN::kChannelsDim)); + VK_CHECK_COND(check_packed_dim_is(out, WHCN::kChannelsDim)); int64_t in_dim = in.dim(); VK_CHECK_COND( @@ -108,7 +108,7 @@ void add_repeat_channel_node( // Parameter buffers {graph.create_params_buffer(repeat_channel_args)}, // Specialization Constants - {SV(t_out->packed_dim_whcn_idx())})); + {SV(t_out->packed_dim())})); } void add_repeat_node( @@ -130,7 +130,7 @@ void add_repeat_node( // After expanding a dimension, we will update the "running_range" since we // will need to copy the "expanded" area. 
- utils::ivec3 running_range = t_in->texture_limits(); + utils::ivec3 running_range = t_in->logical_limits(); const std::vector& in_sizes = t_in->sizes(); diff --git a/backends/vulkan/runtime/graph/ops/impl/Select.cpp b/backends/vulkan/runtime/graph/ops/impl/Select.cpp index 351db0d192b..b2f2245f648 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Select.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Select.cpp @@ -22,8 +22,8 @@ void check_args( int64_t dim, int64_t index, const api::vTensor& t_out) { - VK_CHECK_COND(check_memory_layout_is(t_in, utils::kChannelsPacked)); - VK_CHECK_COND(check_memory_layout_is(t_out, utils::kChannelsPacked)); + VK_CHECK_COND(check_packed_dim_is(t_in, WHCN::kChannelsDim)); + VK_CHECK_COND(check_packed_dim_is(t_out, WHCN::kChannelsDim)); const int64_t in_dim = t_in.dim(); VK_CHECK_COND( @@ -112,7 +112,7 @@ void add_select_int_node( {{out, vkapi::MemoryAccessType::WRITE}, {in, vkapi::MemoryAccessType::READ}}, // Parameter buffers - {t_out->texture_limits_ubo(), + {t_out->logical_limits_ubo(), t_out->sizes_ubo(), // TODO: num_batches and num_texel_per_batch are provided by // t_out->sizes. Can change the following to reduce params diff --git a/backends/vulkan/runtime/graph/ops/impl/Slice.cpp b/backends/vulkan/runtime/graph/ops/impl/Slice.cpp index 8b323bafedd..21e6549513d 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Slice.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Slice.cpp @@ -10,6 +10,8 @@ #include +#include + #include #include #include @@ -42,8 +44,8 @@ void add_slice_tensor_out_node( vTensorPtr t_in = graph.get_tensor(in); vTensorPtr t_out = graph.get_tensor(out); - VK_CHECK_COND(check_memory_layout_is(*t_in, utils::kChannelsPacked)); - VK_CHECK_COND(check_memory_layout_is(*t_out, utils::kChannelsPacked)); + VK_CHECK_COND(check_packed_dim_is(*t_in, WHCN::kChannelsDim)); + VK_CHECK_COND(check_packed_dim_is(*t_out, WHCN::kChannelsDim)); // Need normalize the dim int64_t dim = graph.extract_scalar(dim_ref); @@ -123,7 +125,7 @@ void add_slice_tensor_out_node( kernel_name.reserve(kShaderNameReserve); add_dtype_suffix(kernel_name, *t_out); - utils::uvec3 global_size = t_out->image_extents(); + utils::uvec3 global_size = t_out->logical_limits(); utils::uvec3 local_size = adaptive_work_group_size(global_size); const struct Block final { @@ -149,6 +151,124 @@ void add_slice_tensor_out_node( } } +std::vector get_slice_sizes( + ComputeGraph& graph, + ValueRef in_ref, + ValueRef dim_ref, + ValueRef opt_start_ref, + ValueRef opt_end_ref) { + const int64_t dim = graph.extract_scalar(dim_ref); + std::optional opt_start = + graph.extract_optional_scalar(opt_start_ref); + std::optional opt_end = + graph.extract_optional_scalar(opt_end_ref); + + int64_t dim_size = graph.size_at(dim, in_ref); + int64_t start = opt_start.value_or(0); + int64_t end = opt_end.value_or(dim_size); + + start = normalize_idx(start, dim_size, 0); + end = normalize_idx(end, dim_size, dim_size); + + std::vector new_out_sizes = graph.sizes_of(in_ref); + new_out_sizes.at(dim) = end - start; + + return new_out_sizes; +} + +void resize_slice_view_node( + ComputeGraph* graph, + const std::vector& args, + const std::vector& extra_args) { + (void)args; + vTensorPtr out = graph->get_tensor(extra_args[0]); + + std::vector new_out_sizes = get_slice_sizes( + *graph, + extra_args[1], // input + extra_args[2], // dim + extra_args[3], // optional start + extra_args[4]); // optional end + + out->virtual_resize(new_out_sizes); +} + +void check_slice_view_args( + ComputeGraph& graph, + ValueRef 
in_ref, + ValueRef dim_ref, + ValueRef opt_start_ref, + ValueRef opt_end_ref, + ValueRef opt_step_ref, + ValueRef out_ref) { + VK_CHECK_COND( + graph.val_is_view_of(out_ref, in_ref), + "output must be a view of the input"); + + const int64_t dim = graph.extract_scalar(dim_ref); + const int64_t dim_size = graph.size_at(dim, in_ref); + + int64_t start = + graph.extract_optional_scalar(opt_start_ref).value_or(0); + int64_t end = graph.extract_optional_scalar(opt_end_ref).value_or(0); + int64_t step = + graph.extract_optional_scalar(opt_step_ref).value_or(1); + + start = normalize_idx(start, dim_size, 0); + end = normalize_idx(end, dim_size, dim_size); + + // The start idx must be 0; this is to ensure that the start of the slice view + // does not have any offset with respect to the base buffer storage. If the + // offset is nonzero, then it will potentially change upon a resize; however + // the buffer offset of the view tensor will have been "locked in" when the + // descriptor for its buffer storage is bound to a compute shader. Therefore + // there is no way to update the offset of the view once it has been bound. + VK_CHECK_COND(start == 0, "start must be 0 for slice view"); + VK_CHECK_COND(step == 1, "step must be 1 for slice view"); + + VK_CHECK_COND( + end < dim_size, "end must be less than dim size for slice view"); + + // We must also check that all earlier dims in the dim order have a size of 1. + // This ensures that the slice view encompasses a contiguous memory region of + // the source tensor's memory buffer. + std::vector in_sizes = graph.sizes_of(in_ref); + std::vector in_dim_order = graph.dim_order_of(in_ref); + for (int i = 0; i < in_dim_order.size(); ++i) { + if (in_dim_order[i] == dim) { + break; + } + VK_CHECK_COND(in_sizes[in_dim_order[i]] == 1); + } +} + +void add_slice_view_node( + ComputeGraph& graph, + ValueRef in_ref, + ValueRef dim_ref, + ValueRef opt_start_ref, + ValueRef opt_end_ref, + ValueRef opt_step_ref, + ValueRef out_ref) { + check_slice_view_args( + graph, + in_ref, + dim_ref, + opt_start_ref, + opt_end_ref, + opt_step_ref, + out_ref); + + std::vector new_out_sizes = + get_slice_sizes(graph, in_ref, dim_ref, opt_start_ref, opt_end_ref); + + graph.get_tensor(out_ref)->virtual_resize(new_out_sizes); + + graph.execute_nodes().emplace_back(new ExecuteNode( + resize_slice_view_node, + {out_ref, in_ref, dim_ref, opt_start_ref, opt_end_ref, opt_step_ref})); +} + void slice_tensor_out(ComputeGraph& graph, const std::vector& args) { return add_slice_tensor_out_node( graph, @@ -160,9 +280,36 @@ void slice_tensor_out(ComputeGraph& graph, const std::vector& args) { args[5]); } +void slice_tensor(ComputeGraph& graph, const std::vector& args) { + ValueRef in = args[0]; + ValueRef out = args[5]; + + // Special case if out is a view of in + if (graph.val_is_view_of(out, in)) { + add_slice_view_node( + graph, + in, + args[1], // dim + args[2], // optional start + args[3], // optional end + args[4], // step + out); + return; + } + + add_slice_tensor_out_node( + graph, + in, + args[1], // dim + args[2], // optional start + args[3], // optional end + args[4], // step + out); +} + REGISTER_OPERATORS { VK_REGISTER_OP(aten.slice_copy.Tensor, slice_tensor_out); - VK_REGISTER_OP(aten.slice.Tensor, slice_tensor_out); + VK_REGISTER_OP(aten.slice.Tensor, slice_tensor); } } // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/Slice.h b/backends/vulkan/runtime/graph/ops/impl/Slice.h new file mode 100644 index 00000000000..220066ff1bb --- /dev/null +++ 
b/backends/vulkan/runtime/graph/ops/impl/Slice.h @@ -0,0 +1,28 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include + +#include + +#include + +namespace vkcompute { + +void add_slice_view_node( + ComputeGraph& graph, + ValueRef in_ref, + ValueRef dim_ref, + ValueRef opt_start_ref, + ValueRef opt_end_ref, + ValueRef opt_step_ref, + ValueRef out_ref); + +} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/Softmax.cpp b/backends/vulkan/runtime/graph/ops/impl/Softmax.cpp index fa4d3df944f..dd2fb43e656 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Softmax.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Softmax.cpp @@ -64,7 +64,7 @@ void add_softmax_node( {{out, vkapi::MemoryAccessType::WRITE}, {in_arg, vkapi::MemoryAccessType::READ}}, // Shader params buffers - {t_out->texture_limits_ubo(), + {t_out->logical_limits_ubo(), t_in->sizes_ubo(), graph.create_params_buffer(utils::make_ivec2({in_dim, softmax_dim}))}, // Specialization Constants diff --git a/backends/vulkan/runtime/graph/ops/impl/Split.cpp b/backends/vulkan/runtime/graph/ops/impl/Split.cpp index e093ccf1b72..39039e51025 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Split.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Split.cpp @@ -25,7 +25,7 @@ void add_split_with_sizes_default_node( ValueRef out_list_ref) { vTensorPtr t_in = graph.get_tensor(in); - VK_CHECK_COND(check_memory_layout_is(*t_in, utils::kChannelsPacked)); + VK_CHECK_COND(check_packed_dim_is(*t_in, WHCN::kChannelsDim)); ValueListPtr out_list = graph.get_value_list(out_list_ref); @@ -38,7 +38,7 @@ void add_split_with_sizes_default_node( ValueRef out_ref = (*out_list)[split_idx]; vTensorPtr t_out = graph.get_tensor(out_ref); - VK_CHECK_COND(check_memory_layout_is(*t_out, utils::kChannelsPacked)); + VK_CHECK_COND(check_packed_dim_is(*t_out, WHCN::kChannelsDim)); VK_CHECK_COND(dim_at(*t_out, dim_index) == split_size); } @@ -50,7 +50,7 @@ void add_split_with_sizes_default_node( // Doesn't need to use split_size since we have already verified that the // output tensor's size matches with the split_size. 
vTensorPtr t_out = graph.get_tensor(out_ref); - utils::ivec3 range = t_out->texture_limits(); + utils::ivec3 range = t_out->logical_limits(); add_copy_offset_node(graph, in, range, src_offset, dst_offset, out_ref); src_offset[0] += range[0]; @@ -61,7 +61,7 @@ void add_split_with_sizes_default_node( for (ValueRef out_ref : *out_list) { vTensorPtr t_out = graph.get_tensor(out_ref); - utils::ivec3 range = t_out->texture_limits(); + utils::ivec3 range = t_out->logical_limits(); add_copy_offset_node(graph, in, range, src_offset, dst_offset, out_ref); src_offset[1] += range[1]; @@ -72,7 +72,7 @@ void add_split_with_sizes_default_node( for (ValueRef out_ref : *out_list) { vTensorPtr t_out = graph.get_tensor(out_ref); - utils::ivec3 range = t_out->texture_limits(); + utils::ivec3 range = t_out->logical_limits(); add_copy_offset_node(graph, in, range, src_offset, dst_offset, out_ref); src_offset[2] += range[2]; diff --git a/backends/vulkan/runtime/graph/ops/impl/Staging.cpp b/backends/vulkan/runtime/graph/ops/impl/Staging.cpp index b02613c2083..ef6e8347df8 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Staging.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Staging.cpp @@ -31,7 +31,7 @@ void add_staging_to_tensor_node( graph.strides_ubo(out_tensor), graph.numel_ubo(out_tensor)}); } else { - ubos.append(graph.sizes_ubo(out_tensor)); + ubos.append({graph.sizes_ubo(out_tensor), graph.axis_map_ubo(out_tensor)}); } graph.execute_nodes().emplace_back(new ExecuteNode( @@ -45,7 +45,7 @@ void add_staging_to_tensor_node( // Parameter Buffers ubos, // Specialization Constants - {SV(graph.packed_dim_whcn_idx_of(out_tensor))}, + {SV(graph.packed_dim_of(out_tensor))}, // Resizing Logic nullptr, {})); @@ -69,7 +69,7 @@ void add_tensor_to_staging_node( graph.strides_ubo(in_tensor), graph.numel_ubo(in_tensor)}); } else { - ubos.append(graph.sizes_ubo(in_tensor)); + ubos.append({graph.sizes_ubo(in_tensor), graph.axis_map_ubo(in_tensor)}); } // Normally, the image_to_nchw shader is structured so that each thread reads @@ -97,7 +97,7 @@ void add_tensor_to_staging_node( // Parameter Buffers ubos, // Specialization Constants - {SV(graph.packed_dim_whcn_idx_of(in_tensor))})); + {SV(graph.packed_dim_of(in_tensor))})); } ValueRef prepack( @@ -113,7 +113,7 @@ ValueRef prepack( if (graph.is_buffer_storage(v)) { ubos.append({graph.sizes_ubo(v), graph.strides_ubo(v), graph.numel_ubo(v)}); } else { - ubos.append(graph.sizes_ubo(v)); + ubos.append({graph.sizes_ubo(v), graph.axis_map_ubo(v)}); } graph.prepack_nodes().emplace_back(new PrepackNode( @@ -127,7 +127,34 @@ ValueRef prepack( // Parameter Buffers ubos, // Specialization Constants - {SV(graph.packed_dim_whcn_idx_of(v))})); + {SV(graph.packed_dim_of(v))})); + + return v; +} + +ValueRef prepack_buffer( + ComputeGraph& graph, + const ValueRef vref, + const utils::GPUMemoryLayout layout) { + ValueRef v = graph.add_tensor_like(vref, utils::kBuffer, layout); + + vkapi::ShaderInfo shader = VK_KERNEL_FROM_STR("buffer_to_buffer"); + + vkapi::ParamsBindList ubos; + ubos.append({graph.numel_ubo(v)}); + + graph.prepack_nodes().emplace_back(new PrepackNode( + graph, + shader, + graph.create_global_wg_size(v), + graph.create_local_wg_size(v), + // Input and Outputs + vref, + v, + // Parameter Buffers + ubos, + // Specialization Constants + {})); return v; } @@ -143,6 +170,17 @@ ValueRef prepack_if_tensor_ref( } } +ValueRef prepack_buffer_if_tensor_ref( + ComputeGraph& graph, + const ValueRef v, + const utils::GPUMemoryLayout layout) { + if (graph.val_is_tref(v)) { + return 
prepack_buffer(graph, v, layout); + } else { + return v; + } +} + ValueRef prepack_if_tensor_ref(ComputeGraph& graph, const ValueRef v) { if (graph.val_is_tref(v)) { utils::GPUMemoryLayout layout = @@ -153,4 +191,14 @@ ValueRef prepack_if_tensor_ref(ComputeGraph& graph, const ValueRef v) { } } +ValueRef prepack_buffer_if_tensor_ref(ComputeGraph& graph, const ValueRef v) { + if (graph.val_is_tref(v)) { + utils::GPUMemoryLayout layout = + graph.suggested_memory_layout(graph.get_tref(v)->sizes); + return prepack_buffer(graph, v, layout); + } else { + return v; + } +} + } // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/Staging.h b/backends/vulkan/runtime/graph/ops/impl/Staging.h index fc875de80dd..88a9630239a 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Staging.h +++ b/backends/vulkan/runtime/graph/ops/impl/Staging.h @@ -29,6 +29,13 @@ ValueRef prepack_if_tensor_ref( const ValueRef v, const utils::GPUMemoryLayout layout); +ValueRef prepack_buffer_if_tensor_ref( + ComputeGraph& graph, + const ValueRef v, + const utils::GPUMemoryLayout layout); + ValueRef prepack_if_tensor_ref(ComputeGraph& graph, const ValueRef v); +ValueRef prepack_buffer_if_tensor_ref(ComputeGraph& graph, const ValueRef v); + } // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/Sum.cpp b/backends/vulkan/runtime/graph/ops/impl/Sum.cpp index b466f404ad1..c0ce9e4f2c4 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Sum.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Sum.cpp @@ -47,8 +47,8 @@ void resize_sum_node( } void check_sum_args(const api::vTensor& in, const api::vTensor& out) { - VK_CHECK_COND(check_memory_layout_is(in, utils::kChannelsPacked)); - VK_CHECK_COND(check_memory_layout_is(out, utils::kChannelsPacked)); + VK_CHECK_COND(check_packed_dim_is(in, WHCN::kChannelsDim)); + VK_CHECK_COND(check_packed_dim_is(out, WHCN::kChannelsDim)); } void add_sum_dim_node( @@ -85,7 +85,7 @@ void add_sum_dim_node( {{out, vkapi::MemoryAccessType::WRITE}, {arg, vkapi::MemoryAccessType::READ}}, // Shader params buffers - {t_out->texture_limits_ubo(), + {t_out->logical_limits_ubo(), graph.create_params_buffer(dim + 4 - in_dim), graph.create_params_buffer(dim_size), graph.create_params_buffer(int(ceil(channel / 4.0)))}, diff --git a/backends/vulkan/runtime/graph/ops/impl/Transpose.cpp b/backends/vulkan/runtime/graph/ops/impl/Transpose.cpp index faa99ec1a18..98a104b4b70 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Transpose.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Transpose.cpp @@ -10,45 +10,14 @@ #include -#include +#include + #include #include -#include - namespace vkcompute { -/* - * Transposing for sizes and strides is as simple as swapping the values at - * dim0 and dim1 in the sizes/strides vector. - */ -void swap_vector_inplace( - std::vector& vec, - const int64_t dim0, - const int64_t dim1) { - std::iter_swap(vec.begin() + dim0, vec.begin() + dim1); -} - -/* - * Transposing the dim order is a bit more unintuitive. dim0 and dim1 have - * swapped their "identities", so we need to swap the values of dim0 and dim1 - * wherever they appear in the dim order vector. Compare this to just swapping - * the elements at dim0 and dim1 in the strides or sizes vectors. 
- */ -void transpose_dim_order_inplace( - std::vector& dim_order, - const int64_t dim0, - const int64_t dim1) { - for (int i = 0; i < dim_order.size(); ++i) { - if (dim_order[i] == dim0) { - dim_order[i] = dim1; - } else if (dim_order[i] == dim1) { - dim_order[i] = dim0; - } - } -} - void resize_transpose_view_node( ComputeGraph* graph, const std::vector& args, @@ -61,12 +30,9 @@ void resize_transpose_view_node( const int64_t dim1 = graph->extract_scalar(extra_args[3]); std::vector new_sizes = in->sizes(); - std::vector new_dim_order = in->dim_order(); - - swap_vector_inplace(new_sizes, dim0, dim1); - transpose_dim_order_inplace(new_dim_order, dim0, dim1); - - out->virtual_reconfigure(new_sizes, new_dim_order); + // Transpose the resized input sizes + std::iter_swap(new_sizes.begin() + dim0, new_sizes.begin() + dim1); + out->virtual_resize(new_sizes); } void check_transpose_view_args( @@ -95,13 +61,8 @@ void add_transpose_view_node( const int64_t dim0 = graph.extract_scalar(dim0_ref); const int64_t dim1 = graph.extract_scalar(dim1_ref); - std::vector new_sizes = graph.sizes_of(input_ref); - std::vector new_dim_order = graph.dim_order_of(input_ref); - - swap_vector_inplace(new_sizes, dim0, dim1); - transpose_dim_order_inplace(new_dim_order, dim0, dim1); - - graph.get_tensor(out_ref)->virtual_reconfigure(new_sizes, new_dim_order); + check_transpose_view_args(graph, input_ref, dim0, dim1, out_ref); + graph.get_tensor(out_ref)->virtual_transpose(dim0, dim1); graph.execute_nodes().emplace_back(new ExecuteNode( resize_transpose_view_node, {out_ref, input_ref, dim0_ref, dim1_ref})); diff --git a/backends/vulkan/runtime/graph/ops/impl/Transpose.h b/backends/vulkan/runtime/graph/ops/impl/Transpose.h new file mode 100644 index 00000000000..a4fc4029222 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/impl/Transpose.h @@ -0,0 +1,26 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#pragma once + +#include + +#include + +#include + +namespace vkcompute { + +void add_transpose_view_node( + ComputeGraph& graph, + ValueRef input_ref, + ValueRef dim0_ref, + ValueRef dim1_ref, + ValueRef out_ref); + +} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/UnaryOp.cpp b/backends/vulkan/runtime/graph/ops/impl/UnaryOp.cpp index 075c0bc923a..ea27183ead0 100644 --- a/backends/vulkan/runtime/graph/ops/impl/UnaryOp.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/UnaryOp.cpp @@ -46,7 +46,7 @@ void add_unary_op_node( if (graph.is_buffer_storage(out)) { ubos.append({graph.numel_ubo(out)}); } else { - ubos.append({graph.texture_limits_ubo(out)}); + ubos.append({graph.logical_limits_ubo(out)}); } ubos.append( {graph.create_params_buffer(min), graph.create_params_buffer(max)}); @@ -114,12 +114,6 @@ float get_val_or_inf(ComputeGraph& graph, const ValueRef& val, bool max) { "hardshrink"); \ } -#define DEFINE_HARDSWISH_FN(op_name) \ - void op_name(ComputeGraph& graph, const std::vector& args) { \ - return add_unary_op_node( \ - graph, args[0], kDummyFloat, kDummyFloat, args[1], #op_name); \ - } - void gelu(ComputeGraph& graph, const std::vector& args) { // args[1] is the `approximate` string // https://fburl.com/code/9omngmyo @@ -140,7 +134,8 @@ DEFINE_CLAMP_FN(clamp); DEFINE_CLAMP_FN(hardtanh); DEFINE_RELU_FN(relu); DEFINE_HARDSHRINK_FN(hardshrink); -DEFINE_HARDSWISH_FN(hardswish); +DEFINE_ACTIVATION_FN(hardswish); +DEFINE_ACTIVATION_FN(hardsigmoid); REGISTER_OPERATORS { VK_REGISTER_OP(aten.abs.default, abs); @@ -157,6 +152,7 @@ REGISTER_OPERATORS { VK_REGISTER_OP(aten.tanh.default, tanh); VK_REGISTER_OP(aten.hardshrink.default, hardshrink); VK_REGISTER_OP(aten.hardswish.default, hardswish); + VK_REGISTER_OP(aten.hardsigmoid.default, hardsigmoid); } } // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/Upsample.cpp b/backends/vulkan/runtime/graph/ops/impl/Upsample.cpp index 9183f2aea80..f7fe5282e02 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Upsample.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Upsample.cpp @@ -66,7 +66,7 @@ void add_upsample_nearest2d_node( ValueRef arg_in = prepack_if_tensor_ref(graph, in); vTensorPtr t_in = graph.get_tensor(in); - utils::uvec3 input_sizes = t_in->image_extents(); + utils::uvec3 input_sizes = t_in->logical_limits(); utils::ivec2 input_size = { utils::safe_downcast(input_sizes[0]), @@ -105,7 +105,7 @@ void add_upsample_nearest2d_node( {{out, vkapi::MemoryAccessType::WRITE}, {arg_in, vkapi::MemoryAccessType::READ}}, // Shader params buffers - {t_out->texture_limits_ubo(), + {t_out->logical_limits_ubo(), graph.create_params_buffer(input_size), graph.create_params_buffer(rev_scales)}, // Specialization Constants diff --git a/backends/vulkan/runtime/graph/ops/impl/View.cpp b/backends/vulkan/runtime/graph/ops/impl/View.cpp index 507dbdcf8b1..4832c16ab99 100644 --- a/backends/vulkan/runtime/graph/ops/impl/View.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/View.cpp @@ -76,7 +76,7 @@ void add_view_node( // Parameter Buffers {t_out->sizes_ubo(), t_in->sizes_ubo()}, // Specialization Constants - {SV(t_in->packed_dim_whcn_idx()), SV(t_out->packed_dim_whcn_idx())}, + {SV(t_in->packed_dim()), SV(t_out->packed_dim())}, // Resizing Logic resize_view_node, {sizes})); diff --git a/backends/vulkan/runtime/graph/ops/impl/utils/DimUtils.h b/backends/vulkan/runtime/graph/ops/impl/utils/DimUtils.h index 45dfceb3f0d..4bd8e9b900b 100644 --- a/backends/vulkan/runtime/graph/ops/impl/utils/DimUtils.h +++ 
b/backends/vulkan/runtime/graph/ops/impl/utils/DimUtils.h @@ -32,7 +32,8 @@ constexpr DimIndex kChannel4D = DimIndex::DIM_3RD_LAST; constexpr DimIndex kBatch4D = DimIndex::DIM_4TH_LAST; inline DimIndex normalize_to_dim_index(const api::vTensor& v_in, int32_t dim) { - return static_cast(dim - v_in.dim()); + return dim < 0 ? static_cast(dim) + : static_cast(dim - v_in.dim()); } /* diff --git a/backends/vulkan/runtime/graph/ops/impl/utils/QPackUtils.cpp b/backends/vulkan/runtime/graph/ops/impl/utils/QPackUtils.cpp new file mode 100644 index 00000000000..4cf678a9dcb --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/impl/utils/QPackUtils.cpp @@ -0,0 +1,72 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include + +namespace vkcompute { + +void pack4(const uint8_t* w_ptr, uint8_t* b_ptr, uint32_t N, uint32_t K) { + for (int32_t n = 0; n < N; n++) { + for (int32_t k2 = 0; k2 < K / 2; k2++) { + uint8_t src_val0 = w_ptr[n * K + k2 * 2]; + uint8_t src_val1 = w_ptr[n * K + k2 * 2 + 1]; + b_ptr[n * (K / 2) + k2] = (uint8_t(src_val1) << 4) | uint8_t(src_val0); + } + } +} + +std::vector int4mm_pack_weights( + const std::vector& W_sizes, + const uint8_t* w_ptr) { + const int32_t N = utils::val_at(-1, W_sizes); + const int32_t K = utils::val_at(-2, W_sizes); + + const auto numel = K * N; + std::vector w_ptr_T(numel); + std::vector b_ptr(utils::div_up(numel, 2)); + + // Transpose the weights + for (int32_t k = 0; k < K; k++) { + for (int32_t n = 0; n < N; n++) { + w_ptr_T[n * K + k] = w_ptr[k * N + n]; + } + } + + // Pack two int4s into each int8 + pack4(w_ptr_T.data(), b_ptr.data(), N, K); + + return b_ptr; +} + +std::vector int4mm_dequantize_weights( + const std::vector& W_sizes, + const uint8_t* w_ptr, + const uint32_t group_size, + const float* scales_and_zeros) { + const int64_t N = utils::val_at(-1, W_sizes); + const int64_t K = utils::val_at(-2, W_sizes); + + std::vector w_ptr_deq(K * N); + const int k_groups = K / group_size; + const int zeros_stride = k_groups * N; + + for (int k = 0; k < K; k++) { + for (int n = 0; n < N; n++) { + const int kb = k / group_size; + const int scale_idx = k_groups * n + kb; + const float scale = scales_and_zeros[scale_idx]; + const float zero = + scales_and_zeros[scale_idx + zeros_stride] - scale * 8.0; + w_ptr_deq[k * N + n] = w_ptr[k * N + n] * scale + zero; + } + } + + return w_ptr_deq; +} + +} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/utils/QPackUtils.h b/backends/vulkan/runtime/graph/ops/impl/utils/QPackUtils.h new file mode 100644 index 00000000000..4c4cf26d504 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/impl/utils/QPackUtils.h @@ -0,0 +1,25 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#pragma once + +#include + +namespace vkcompute { + +std::vector int4mm_pack_weights( + const std::vector& W_sizes, + const uint8_t* w_ptr); + +std::vector int4mm_dequantize_weights( + const std::vector& W_sizes, + const uint8_t* w_ptr, + const uint32_t group_size, + const float* scales_and_zeros); + +} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/utils/TensorUtils.cpp b/backends/vulkan/runtime/graph/ops/impl/utils/TensorUtils.cpp index 2737a86a1ab..9d010c794ec 100644 --- a/backends/vulkan/runtime/graph/ops/impl/utils/TensorUtils.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/utils/TensorUtils.cpp @@ -45,28 +45,26 @@ bool check_same_sizes_at( return utils::val_at(d1, t1.sizes()) == utils::val_at(d2, t2.sizes()); } -bool check_memory_layout_is( - const api::vTensor& t, - utils::GPUMemoryLayout layout) { - return t.gpu_memory_layout() == layout; +bool check_packed_dim_is(const api::vTensor& t, const int32_t packed_dim) { + return t.packed_dim() == packed_dim; } bool check_same_ndim(const api::vTensor& t1, const api::vTensor& t2) { return t1.sizes().size() == t2.sizes().size(); } -bool check_same_memory_layout(const api::vTensor& t1, const api::vTensor& t2) { - return t1.gpu_memory_layout() == t2.gpu_memory_layout(); +bool check_same_packed_dim(const api::vTensor& t1, const api::vTensor& t2) { + return t1.packed_dim() == t2.packed_dim(); } -bool check_same_memory_layout( +bool check_same_packed_dim( const api::vTensor& t1, const api::vTensor& t2, const api::vTensor& t3) { - if (t1.gpu_memory_layout() != t2.gpu_memory_layout()) { + if (t1.packed_dim() != t2.packed_dim()) { return false; } - return (t1.gpu_memory_layout() == t3.gpu_memory_layout()); + return (t1.packed_dim() == t3.packed_dim()); } // @@ -78,13 +76,15 @@ bool is_packed_dim_broadcasted( const api::vTensor& rcvr) { // We assume that the tensors are broadcastable. If values aren't equal at // some index, then the value of rcvr is 1 and hence should be broadcasted. 
- switch (sndr.gpu_memory_layout()) { - case utils::kChannelsPacked: + switch (sndr.packed_dim()) { + case WHCN::kChannelsDim: return utils::val_at(-3, sndr.sizes()) > utils::val_at(-3, rcvr.sizes()); - case utils::kHeightPacked: + case WHCN::kHeightDim: return utils::val_at(-2, sndr.sizes()) > utils::val_at(-2, rcvr.sizes()); - case utils::kWidthPacked: + case WHCN::kWidthDim: return utils::val_at(-1, sndr.sizes()) > utils::val_at(-1, rcvr.sizes()); + default: + VK_THROW("Invalid packed dim"); } } diff --git a/backends/vulkan/runtime/graph/ops/impl/utils/TensorUtils.h b/backends/vulkan/runtime/graph/ops/impl/utils/TensorUtils.h index 44155a7ce62..754cc551d0e 100644 --- a/backends/vulkan/runtime/graph/ops/impl/utils/TensorUtils.h +++ b/backends/vulkan/runtime/graph/ops/impl/utils/TensorUtils.h @@ -34,13 +34,11 @@ bool check_same_sizes_at( const api::vTensor& t2, int64_t d2); -bool check_memory_layout_is( - const api::vTensor& t, - utils::GPUMemoryLayout layout); +bool check_packed_dim_is(const api::vTensor& t, const int32_t packed_dim); -bool check_same_memory_layout(const api::vTensor& t1, const api::vTensor& t2); +bool check_same_packed_dim(const api::vTensor& t1, const api::vTensor& t2); -bool check_same_memory_layout( +bool check_same_packed_dim( const api::vTensor& t1, const api::vTensor& t2, const api::vTensor& t3); diff --git a/backends/vulkan/runtime/graph/ops/utils/BindingUtils.cpp b/backends/vulkan/runtime/graph/ops/utils/BindingUtils.cpp index b0964ace225..2cfb34a052e 100644 --- a/backends/vulkan/runtime/graph/ops/utils/BindingUtils.cpp +++ b/backends/vulkan/runtime/graph/ops/utils/BindingUtils.cpp @@ -66,7 +66,7 @@ uint32_t bind_params_to_descriptor_set( } void bind_staging_to_descriptor_set( - api::StorageBuffer& staging, + api::StagingBuffer& staging, vkapi::DescriptorSet& descriptor_set, const uint32_t idx) { descriptor_set.bind(idx, staging.buffer()); diff --git a/backends/vulkan/runtime/graph/ops/utils/BindingUtils.h b/backends/vulkan/runtime/graph/ops/utils/BindingUtils.h index 3a7ec029da7..eed39a97979 100644 --- a/backends/vulkan/runtime/graph/ops/utils/BindingUtils.h +++ b/backends/vulkan/runtime/graph/ops/utils/BindingUtils.h @@ -40,7 +40,7 @@ uint32_t bind_params_to_descriptor_set( const uint32_t base_idx); void bind_staging_to_descriptor_set( - api::StorageBuffer& staging, + api::StagingBuffer& staging, vkapi::DescriptorSet& descriptor_set, const uint32_t idx); diff --git a/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.cpp b/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.cpp index 89f542de6fc..81d5c9e98af 100644 --- a/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.cpp +++ b/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.cpp @@ -69,28 +69,26 @@ void add_ndim_suffix(std::string& kernel_name, const api::vTensor& tensor) { } } -void add_memory_layout_suffix( - std::string& kernel_name, - utils::GPUMemoryLayout layout) { - switch (layout) { - case utils::kChannelsPacked: - kernel_name += "_C_packed"; +void add_packed_dim_suffix(std::string& kernel_name, const int32_t packed_dim) { + switch (packed_dim) { + case WHCN::kWidthDim: + kernel_name += "_W_packed"; break; - case utils::kHeightPacked: + case WHCN::kHeightDim: kernel_name += "_H_packed"; break; - case utils::kWidthPacked: - kernel_name += "_W_packed"; + case WHCN::kChannelsDim: + kernel_name += "_C_packed"; break; default: - break; + VK_THROW("Invalid packed dim!"); } } -void add_memory_layout_suffix( +void add_packed_dim_suffix( std::string& kernel_name, const api::vTensor& 
tensor) { - return add_memory_layout_suffix(kernel_name, tensor.gpu_memory_layout()); + return add_packed_dim_suffix(kernel_name, tensor.packed_dim()); } } // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.h b/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.h index e8f4f0d229e..10084054964 100644 --- a/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.h +++ b/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.h @@ -29,10 +29,8 @@ void add_dtype_suffix(std::string& kernel_name, const api::vTensor& tensor); void add_ndim_suffix(std::string& kernel_name, const size_t ndim); void add_ndim_suffix(std::string& kernel_name, const api::vTensor& tensor); -void add_memory_layout_suffix( - std::string& kernel_name, - const utils::GPUMemoryLayout layout); -void add_memory_layout_suffix( +void add_packed_dim_suffix(std::string& kernel_name, const int32_t packed_dim); +void add_packed_dim_suffix( std::string& kernel_name, const api::vTensor& tensor); diff --git a/backends/vulkan/runtime/graph/ops/utils/StagingUtils.cpp b/backends/vulkan/runtime/graph/ops/utils/StagingUtils.cpp index 294e36b9a86..8804bcf2ef6 100644 --- a/backends/vulkan/runtime/graph/ops/utils/StagingUtils.cpp +++ b/backends/vulkan/runtime/graph/ops/utils/StagingUtils.cpp @@ -13,88 +13,8 @@ #include -#include - namespace vkcompute { -template -void memcpy_to_mapping_impl( - const void* src, - vkapi::MemoryMap& dst_mapping, - const size_t nbytes) { - T* data_ptr = dst_mapping.template data(); - memcpy(data_ptr, reinterpret_cast(src), nbytes); -} - -template -void memcpy_from_mapping_impl( - vkapi::MemoryMap& src_mapping, - void* dst, - const size_t nbytes) { - T* data_ptr = src_mapping.template data(); - memcpy(reinterpret_cast(dst), data_ptr, nbytes); -} - -void memcpy_to_mapping( - const void* src, - vkapi::MemoryMap& dst_mapping, - const size_t nbytes, - const vkapi::ScalarType dtype) { -#define DTYPE_CASE(ctype, vkformat, name) \ - case vkapi::ScalarType::name: \ - memcpy_to_mapping_impl(src, dst_mapping, nbytes); \ - break; - - switch (dtype) { - VK_FORALL_SCALAR_TYPES(DTYPE_CASE) - default: - VK_THROW("Unrecognized dtype!"); - } -#undef DTYPE_CASE -} - -void memcpy_from_mapping( - vkapi::MemoryMap& src_mapping, - void* dst, - const size_t nbytes, - const vkapi::ScalarType dtype) { -#define DTYPE_CASE(ctype, vkformat, name) \ - case vkapi::ScalarType::name: \ - memcpy_from_mapping_impl(src_mapping, dst, nbytes); \ - break; - - switch (dtype) { - VK_FORALL_SCALAR_TYPES(DTYPE_CASE) - default: - VK_THROW("Unrecognized dtype!"); - } -#undef DTYPE_CASE -} - -void copy_ptr_to_staging( - const void* src, - api::StorageBuffer& staging, - const size_t nbytes) { - vkapi::MemoryMap mapping(staging.buffer(), vkapi::MemoryAccessType::WRITE); - mapping.invalidate(); - memcpy_to_mapping(src, mapping, nbytes, staging.dtype()); -} - -void copy_staging_to_ptr( - api::StorageBuffer& staging, - void* dst, - const size_t nbytes) { - vkapi::MemoryMap mapping(staging.buffer(), vkapi::MemoryAccessType::READ); - mapping.invalidate(); - memcpy_from_mapping(mapping, dst, nbytes, staging.dtype()); -} - -void set_staging_zeros(api::StorageBuffer& staging, const size_t nbytes) { - vkapi::MemoryMap mapping(staging.buffer(), vkapi::MemoryAccessType::WRITE); - uint8_t* data_ptr = mapping.template data(); - memset(data_ptr, 0, staging.nbytes()); -} - vkapi::ShaderInfo get_nchw_to_tensor_shader( const api::vTensor& v_dst, const bool int8_buffer_enabled) { diff --git 
a/backends/vulkan/runtime/graph/ops/utils/StagingUtils.h b/backends/vulkan/runtime/graph/ops/utils/StagingUtils.h index cabc17f30ee..8d63958a738 100644 --- a/backends/vulkan/runtime/graph/ops/utils/StagingUtils.h +++ b/backends/vulkan/runtime/graph/ops/utils/StagingUtils.h @@ -12,25 +12,6 @@ namespace vkcompute { -// -// Functions to copy data into and out of a staging buffer -// - -void copy_ptr_to_staging( - const void* src, - api::StorageBuffer& staging, - const size_t nbytes); -void copy_staging_to_ptr( - api::StorageBuffer& staging, - void* dst, - const size_t nbytes); - -void set_staging_zeros(api::StorageBuffer& staging, const size_t nbytes); - -// -// Functions to get shaders -// - vkapi::ShaderInfo get_nchw_to_tensor_shader( const api::vTensor& v_dst, bool int8_buffer_enabled = true); diff --git a/backends/vulkan/runtime/utils/StorageUtils.h b/backends/vulkan/runtime/utils/StorageUtils.h index 3cd60e25fd2..5ada8df8af7 100644 --- a/backends/vulkan/runtime/utils/StorageUtils.h +++ b/backends/vulkan/runtime/utils/StorageUtils.h @@ -8,7 +8,19 @@ #pragma once +#include + namespace vkcompute { + +// Convenience constexpr to attach semantic names to WHCN dimension index +namespace WHCN { + +constexpr int32_t kWidthDim = 0; +constexpr int32_t kHeightDim = 1; +constexpr int32_t kChannelsDim = 2; + +} // namespace WHCN + namespace utils { // @@ -36,20 +48,42 @@ static constexpr StorageType kTexture3D = StorageType::TEXTURE_3D; static constexpr StorageType kTexture2D = StorageType::TEXTURE_2D; /* - * The enum below is used to describe how tensor data is laid out when stored in - * GPU memory; specifically, it indicates how tensor data is packed along a - * texel (i.e. a vector of 4 scalar values). + * A tensor's memory layout is defined in one of two ways: + * + * 1. If it's a buffer backed tensor, the memory layout is defined by its + * `dim_order`, and by extension its `strides`. + * 2. If it's a texture backed tensor, the memory layout is defined by the + * combination of its `axis_map` and its `packed_dim`. * - * Each enum entry indicates which tensor dimension is packed along a texel, and - * it's value is set to the index of that dimension in WHCN dimension order. For - * instance, the width dimension corresponds to index 0, so the - * TENSOR_WIDTH_PACKED enum entry is set to 0. + * Providing explicit memory layout metadata upon tensor construction is not + * very convenient from an API perspective, so the `GPUMemoryLayout` serves as + * an abstraction that is used to determine how to initialize a tensor's layout + * metadata based on the developer's intent. A `GPUMemoryLayout` is provided to + * the constructor of `vTensor`, which will use it to determine how to set its + * `dim_order` if it's a buffer backed tensor, or how to set its `axis_map` and + * `packed_dim` if it's a texture backed tensor. * - * When interpreted as an integer, the enum value can be used as a dim index - * representing the packed dimension. This is used in shaders to resolve tensor - * indexing calculations. + * Note that GPUMemoryLayout is not stored as a tensor property, as it does not + * have any meaning after the vTensor is constructed. After construction, + * methods such as `virtual_transpose()` may be used to modify the tensor's + * layout metadata that cannot be represented by any `GPUMemoryLayout` entry. + * Nonetheless, a "best guess" of the closest memory layout can be produced via + * the `estimate_memory_layout()` API of `vTensor`. 
+ * + * Currently, only 3 memory layouts are provided, but more will be added in the + * future that will enable different functionality such as minimizing texture + * memory footprint. */ enum class GPUMemoryLayout : uint8_t { + /* + * The below memory layouts will produce a `vTensor` with the following + * properties: + * + * 1. For buffer backed tensors, the `dim_order` will be the same as a + * contiguous dim order, but with the specified dim last in the dim order. + * 2. For texture backed tensors, the packed dim will be the specified dim. + * The axis map will be `{0, 1, 2, 2}`. + */ TENSOR_WIDTH_PACKED = 0u, TENSOR_HEIGHT_PACKED = 1u, TENSOR_CHANNELS_PACKED = 2u, @@ -64,14 +98,35 @@ static constexpr GPUMemoryLayout kHeightPacked = static constexpr GPUMemoryLayout kChannelsPacked = GPUMemoryLayout::TENSOR_CHANNELS_PACKED; -/* - * Given a GPUMemoryLayout, return an offset that can be used to determine the - * index of the dimension that is packed along texels, assuming NCHW dimension - * order. The index of the packed dimension will be ndim - offset. - */ template -T to_packed_dim_nchw_offset(const GPUMemoryLayout layout) { - return static_cast(layout) + 1; +T to_packed_dim(const GPUMemoryLayout layout) { + switch (layout) { + case kWidthPacked: + return 0; + case kHeightPacked: + return 1; + case kChannelsPacked: + return 2; + }; + // Should be unreachable + return 0; +} + +inline std::ostream& operator<<( + std::ostream& os, + const GPUMemoryLayout layout) { + switch (layout) { + case kWidthPacked: + os << "TENSOR_WIDTH_PACKED"; + break; + case kHeightPacked: + os << "TENSOR_HEIGHT_PACKED"; + break; + case kChannelsPacked: + os << "TENSOR_CHANNELS_PACKED"; + break; + } + return os; } } // namespace utils diff --git a/backends/vulkan/runtime/utils/VecUtils.h b/backends/vulkan/runtime/utils/VecUtils.h index 55bb0f7d1b5..ad4434cf5af 100644 --- a/backends/vulkan/runtime/utils/VecUtils.h +++ b/backends/vulkan/runtime/utils/VecUtils.h @@ -238,6 +238,28 @@ struct vec final { // NOLINTNEXTLINE Type data[N]; + vec() = default; + + // Standard constructor with initializer list + vec(std::initializer_list values) { + VK_CHECK_COND(values.size() == N); + std::copy(values.begin(), values.end(), data); + } + + // Conversion constructor from an _integral_ vec type. Note that this is only + // defined if `OtherType` is an integral type to disallow implicit narrowing. 
+ template < + typename OtherType, + typename std::enable_if< + !std::is_same::value && + std::is_integral::value, + int>::type = 0> + /* implicit */ vec(const vec& other) { + for (int i = 0; i < N; ++i) { + data[i] = safe_downcast(other[i]); + } + } + const Type& operator[](const uint32_t& i) const { VK_CHECK_COND(i >= 0 && i < N, "Index out of bounds!"); return data[i]; diff --git a/backends/vulkan/runtime/vk_api/memory/Allocation.cpp b/backends/vulkan/runtime/vk_api/memory/Allocation.cpp index b07bb2862d3..908feb0d3fc 100644 --- a/backends/vulkan/runtime/vk_api/memory/Allocation.cpp +++ b/backends/vulkan/runtime/vk_api/memory/Allocation.cpp @@ -26,36 +26,24 @@ namespace vkcompute { namespace vkapi { Allocation::Allocation() - : memory_requirements{}, - create_info{}, - allocator(VK_NULL_HANDLE), - allocation(VK_NULL_HANDLE), - is_copy_(false) {} + : allocator(VK_NULL_HANDLE), allocation(VK_NULL_HANDLE), is_copy_(false) {} Allocation::Allocation( VmaAllocator vma_allocator, const VkMemoryRequirements& mem_props, const VmaAllocationCreateInfo& create_info) - : memory_requirements(mem_props), - create_info(create_info), - allocator(vma_allocator), - allocation(VK_NULL_HANDLE), - is_copy_(false) { + : allocator(vma_allocator), allocation(VK_NULL_HANDLE), is_copy_(false) { VK_CHECK(vmaAllocateMemory( - allocator, &memory_requirements, &create_info, &allocation, nullptr)); + allocator, &mem_props, &create_info, &allocation, nullptr)); } Allocation::Allocation(const Allocation& other) noexcept - : memory_requirements(other.memory_requirements), - create_info(other.create_info), - allocator(other.allocator), + : allocator(other.allocator), allocation(other.allocation), is_copy_(true) {} Allocation::Allocation(Allocation&& other) noexcept - : memory_requirements(other.memory_requirements), - create_info(other.create_info), - allocator(other.allocator), + : allocator(other.allocator), allocation(other.allocation), is_copy_(other.is_copy_) { other.allocation = VK_NULL_HANDLE; @@ -64,8 +52,6 @@ Allocation::Allocation(Allocation&& other) noexcept Allocation& Allocation::operator=(Allocation&& other) noexcept { VmaAllocation tmp_allocation = allocation; - memory_requirements = other.memory_requirements; - create_info = other.create_info; allocator = other.allocator; allocation = other.allocation; is_copy_ = other.is_copy_; diff --git a/backends/vulkan/runtime/vk_api/memory/Allocation.h b/backends/vulkan/runtime/vk_api/memory/Allocation.h index cec6f61e766..e56605e14b2 100644 --- a/backends/vulkan/runtime/vk_api/memory/Allocation.h +++ b/backends/vulkan/runtime/vk_api/memory/Allocation.h @@ -55,9 +55,6 @@ struct Allocation final { ~Allocation(); - VkMemoryRequirements memory_requirements; - // The properties this allocation was created with - VmaAllocationCreateInfo create_info; // The allocator object this was allocated from VmaAllocator allocator; // Handles to the allocated memory @@ -78,6 +75,7 @@ struct Allocation final { } friend class VulkanBuffer; + friend class VulkanImage; }; } // namespace vkapi diff --git a/backends/vulkan/runtime/vk_api/memory/Allocator.cpp b/backends/vulkan/runtime/vk_api/memory/Allocator.cpp index 1dadca27a0b..6533f061649 100644 --- a/backends/vulkan/runtime/vk_api/memory/Allocator.cpp +++ b/backends/vulkan/runtime/vk_api/memory/Allocator.cpp @@ -58,6 +58,13 @@ Allocator::~Allocator() { vmaDestroyAllocator(allocator_); } +VmaAllocationCreateInfo Allocator::gpuonly_resource_create_info() { + VmaAllocationCreateInfo alloc_create_info = {}; + alloc_create_info.flags = 
DEFAULT_ALLOCATION_STRATEGY; + alloc_create_info.usage = VMA_MEMORY_USAGE_AUTO_PREFER_DEVICE; + return alloc_create_info; +} + Allocation Allocator::create_allocation( const VkMemoryRequirements& memory_requirements, const VmaAllocationCreateInfo& create_info) { @@ -103,9 +110,7 @@ VulkanImage Allocator::create_image( (VK_IMAGE_USAGE_TRANSFER_SRC_BIT | VK_IMAGE_USAGE_TRANSFER_DST_BIT); } - VmaAllocationCreateInfo alloc_create_info = {}; - alloc_create_info.flags = DEFAULT_ALLOCATION_STRATEGY; - alloc_create_info.usage = VMA_MEMORY_USAGE_AUTO_PREFER_DEVICE; + VmaAllocationCreateInfo alloc_create_info = gpuonly_resource_create_info(); const VulkanImage::ImageProperties image_props{ image_type, @@ -132,45 +137,34 @@ VulkanImage Allocator::create_image( allocate_memory); } -VulkanBuffer Allocator::create_storage_buffer( - const VkDeviceSize size, - const bool gpu_only, - const bool allocate_memory) { +VulkanBuffer Allocator::create_staging_buffer(const VkDeviceSize size) { const VkBufferUsageFlags buffer_usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT; VmaAllocationCreateInfo alloc_create_info = {}; alloc_create_info.flags = DEFAULT_ALLOCATION_STRATEGY; alloc_create_info.usage = VMA_MEMORY_USAGE_AUTO_PREFER_DEVICE; - // The create storage buffer will be accessed by both the CPU and GPU, so set - // the appropriate flags to indicate that the host device will be accessing + // Staging buffers are accessed by both the CPU and GPU, so set the + // appropriate flags to indicate that the host device will be accessing // the data from this buffer. - if (!gpu_only) { - // Deferred memory allocation should only be used for GPU only buffers. - VK_CHECK_COND( - allocate_memory, - "Only GPU-only buffers should use deferred memory allocation"); - - alloc_create_info.flags |= VMA_ALLOCATION_CREATE_HOST_ACCESS_RANDOM_BIT; - alloc_create_info.usage = VMA_MEMORY_USAGE_AUTO_PREFER_HOST; - alloc_create_info.requiredFlags = VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT; - alloc_create_info.preferredFlags = VK_MEMORY_PROPERTY_HOST_COHERENT_BIT | - VK_MEMORY_PROPERTY_HOST_CACHED_BIT; - } + alloc_create_info.flags |= VMA_ALLOCATION_CREATE_HOST_ACCESS_RANDOM_BIT | + VMA_ALLOCATION_CREATE_MAPPED_BIT; + alloc_create_info.usage = VMA_MEMORY_USAGE_AUTO_PREFER_HOST; + alloc_create_info.requiredFlags = VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT; + alloc_create_info.preferredFlags = + VK_MEMORY_PROPERTY_HOST_COHERENT_BIT | VK_MEMORY_PROPERTY_HOST_CACHED_BIT; - return VulkanBuffer( - allocator_, size, alloc_create_info, buffer_usage, allocate_memory); + return VulkanBuffer(allocator_, size, alloc_create_info, buffer_usage); } -VulkanBuffer Allocator::create_staging_buffer(const VkDeviceSize size) { - VmaAllocationCreateInfo alloc_create_info = {}; - alloc_create_info.flags = DEFAULT_ALLOCATION_STRATEGY; - alloc_create_info.usage = VMA_MEMORY_USAGE_AUTO_PREFER_HOST; - - VkBufferUsageFlags buffer_usage = - VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT; +VulkanBuffer Allocator::create_storage_buffer( + const VkDeviceSize size, + const bool allocate_memory) { + const VkBufferUsageFlags buffer_usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT; - return VulkanBuffer(allocator_, size, alloc_create_info, buffer_usage); + VmaAllocationCreateInfo alloc_create_info = gpuonly_resource_create_info(); + return VulkanBuffer( + allocator_, size, alloc_create_info, buffer_usage, allocate_memory); } VulkanBuffer Allocator::create_uniform_buffer(const VkDeviceSize size) { @@ -181,9 +175,7 @@ VulkanBuffer Allocator::create_uniform_buffer(const 
VkDeviceSize size) { VkBufferUsageFlags buffer_usage = VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT; - VulkanBuffer uniform_buffer( - allocator_, size, alloc_create_info, buffer_usage); - return uniform_buffer; + return VulkanBuffer(allocator_, size, alloc_create_info, buffer_usage); } } // namespace vkapi diff --git a/backends/vulkan/runtime/vk_api/memory/Allocator.h b/backends/vulkan/runtime/vk_api/memory/Allocator.h index 904163cefb4..56385eb54d7 100644 --- a/backends/vulkan/runtime/vk_api/memory/Allocator.h +++ b/backends/vulkan/runtime/vk_api/memory/Allocator.h @@ -48,6 +48,8 @@ class Allocator final { VmaAllocator allocator_; public: + VmaAllocationCreateInfo gpuonly_resource_create_info(); + Allocation create_allocation( const VkMemoryRequirements& memory_requirements, const VmaAllocationCreateInfo& create_info); @@ -62,13 +64,12 @@ class Allocator final { const bool allow_transfer = false, const bool allocate_memory = true); + VulkanBuffer create_staging_buffer(const VkDeviceSize); + VulkanBuffer create_storage_buffer( const VkDeviceSize, - const bool gpu_only = true, const bool allocate_memory = true); - VulkanBuffer create_staging_buffer(const VkDeviceSize); - /* * Create a uniform buffer with a specified size */ diff --git a/backends/vulkan/runtime/vk_api/memory/Buffer.cpp b/backends/vulkan/runtime/vk_api/memory/Buffer.cpp index 366b45a5e41..2af3d9efe31 100644 --- a/backends/vulkan/runtime/vk_api/memory/Buffer.cpp +++ b/backends/vulkan/runtime/vk_api/memory/Buffer.cpp @@ -58,8 +58,6 @@ VulkanBuffer::VulkanBuffer( nullptr, // pQueueFamilyIndices }; - memory_.create_info = allocation_create_info; - if (allocate_memory) { VK_CHECK(vmaCreateBuffer( allocator_, @@ -83,7 +81,7 @@ VulkanBuffer::VulkanBuffer( : buffer_properties_(other.buffer_properties_), allocator_(other.allocator_), memory_(other.memory_), - owns_memory_(other.owns_memory_), + owns_memory_(false), is_copy_(true), handle_(other.handle_) { // TODO: set the offset and range appropriately @@ -137,6 +135,12 @@ VulkanBuffer::~VulkanBuffer() { } } +VmaAllocationInfo VulkanBuffer::allocation_info() const { + VmaAllocationInfo info; + vmaGetAllocationInfo(allocator_, memory_.allocation, &info); + return info; +} + VkMemoryRequirements VulkanBuffer::get_memory_requirements() const { VkMemoryRequirements memory_requirements; vkGetBufferMemoryRequirements(this->device(), handle_, &memory_requirements); diff --git a/backends/vulkan/runtime/vk_api/memory/Buffer.h b/backends/vulkan/runtime/vk_api/memory/Buffer.h index 9302048f861..6197a02d402 100644 --- a/backends/vulkan/runtime/vk_api/memory/Buffer.h +++ b/backends/vulkan/runtime/vk_api/memory/Buffer.h @@ -114,9 +114,7 @@ class VulkanBuffer final { return memory_.allocation; } - inline VmaAllocationCreateInfo allocation_create_info() const { - return VmaAllocationCreateInfo(memory_.create_info); - } + VmaAllocationInfo allocation_info() const; inline VkBuffer handle() const { return handle_; diff --git a/backends/vulkan/runtime/vk_api/memory/Image.cpp b/backends/vulkan/runtime/vk_api/memory/Image.cpp index 42352cfb7e7..5029d166166 100644 --- a/backends/vulkan/runtime/vk_api/memory/Image.cpp +++ b/backends/vulkan/runtime/vk_api/memory/Image.cpp @@ -98,6 +98,7 @@ VulkanImage::VulkanImage() allocator_(VK_NULL_HANDLE), memory_{}, owns_memory_(false), + is_copy_(false), handles_{ VK_NULL_HANDLE, VK_NULL_HANDLE, @@ -120,6 +121,7 @@ VulkanImage::VulkanImage( allocator_(vma_allocator), memory_{}, owns_memory_{allocate_memory}, + is_copy_(false), handles_{ VK_NULL_HANDLE, VK_NULL_HANDLE, @@ -157,8 
+159,6 @@ VulkanImage::VulkanImage( layout_, // initialLayout }; - memory_.create_info = allocation_create_info; - if (allocate_memory) { VK_CHECK(vmaCreateImage( allocator_, @@ -175,6 +175,17 @@ VulkanImage::VulkanImage( } } +VulkanImage::VulkanImage(const VulkanImage& other) noexcept + : image_properties_(other.image_properties_), + view_properties_(other.view_properties_), + sampler_properties_(other.sampler_properties_), + allocator_(other.allocator_), + memory_(other.memory_), + owns_memory_{false}, + is_copy_(true), + handles_(other.handles_), + layout_(other.layout_) {} + VulkanImage::VulkanImage(VulkanImage&& other) noexcept : image_properties_(other.image_properties_), view_properties_(other.view_properties_), @@ -182,6 +193,7 @@ VulkanImage::VulkanImage(VulkanImage&& other) noexcept allocator_(other.allocator_), memory_(std::move(other.memory_)), owns_memory_(other.owns_memory_), + is_copy_(other.is_copy_), handles_(other.handles_), layout_(other.layout_) { other.handles_.image = VK_NULL_HANDLE; @@ -201,6 +213,7 @@ VulkanImage& VulkanImage::operator=(VulkanImage&& other) noexcept { allocator_ = other.allocator_; memory_ = std::move(other.memory_); owns_memory_ = other.owns_memory_; + is_copy_ = other.is_copy_; handles_ = other.handles_; layout_ = other.layout_; @@ -212,6 +225,13 @@ VulkanImage& VulkanImage::operator=(VulkanImage&& other) noexcept { } VulkanImage::~VulkanImage() { + // Do not destroy any resources if this class instance is a copy of another + // class instance, since this means that this class instance does not have + // ownership of the underlying resource. + if (is_copy_) { + return; + } + if (VK_NULL_HANDLE != handles_.image_view) { vkDestroyImageView(this->device(), handles_.image_view, nullptr); } diff --git a/backends/vulkan/runtime/vk_api/memory/Image.h b/backends/vulkan/runtime/vk_api/memory/Image.h index 1e78f84a5c5..447e980595f 100644 --- a/backends/vulkan/runtime/vk_api/memory/Image.h +++ b/backends/vulkan/runtime/vk_api/memory/Image.h @@ -22,6 +22,12 @@ #include namespace vkcompute { + +// Forward declare vTensor classes such that they can be set as friend classes +namespace api { +class vTensorStorage; +} // namespace api + namespace vkapi { class ImageSampler final { @@ -96,7 +102,23 @@ class VulkanImage final { VkSampler, const bool allocate_memory = true); - VulkanImage(const VulkanImage&) = delete; + protected: + /* + * The Copy constructor allows for creation of a class instance that are + * "aliases" of another class instance. The resulting class instance will not + * have ownership of the underlying VkImage. + * + * This behaviour is analogous to creating a copy of a pointer, thus it is + * unsafe, as the original class instance may be destroyed before the copy. + * These constructors are therefore marked protected so that they may be used + * only in situations where the lifetime of the original class instance is + * guaranteed to exceed, or at least be the same as, the lifetime of the + * copied class instance. + */ + VulkanImage(const VulkanImage& other) noexcept; + + public: + // To discourage creating copies, the assignment operator is still deleted. 
VulkanImage& operator=(const VulkanImage&) = delete; VulkanImage(VulkanImage&&) noexcept; @@ -123,6 +145,9 @@ class VulkanImage final { Allocation memory_; // Indicates whether the underlying memory is owned by this resource bool owns_memory_; + // Indicates whether this VulkanImage was copied from another VulkanImage, + // thus it does not have ownership of the underlying VKBuffer + bool is_copy_; Handles handles_; // Layout VkImageLayout layout_; @@ -144,10 +169,6 @@ class VulkanImage final { return memory_.allocation; } - inline VmaAllocationCreateInfo allocation_create_info() const { - return VmaAllocationCreateInfo(memory_.create_info); - } - inline VkFormat format() const { return image_properties_.image_format; } @@ -193,10 +214,18 @@ class VulkanImage final { return owns_memory_; } + inline bool is_copy() const { + return is_copy_; + } + inline operator bool() const { return (handles_.image != VK_NULL_HANDLE); } + inline bool is_copy_of(const VulkanImage& other) const { + return (handles_.image == other.handles_.image) && is_copy_; + } + inline void bind_allocation(const Allocation& memory) { VK_CHECK_COND(!memory_, "Cannot bind an already bound allocation!"); VK_CHECK(vmaBindImageMemory(allocator_, memory.allocation, handles_.image)); @@ -207,6 +236,8 @@ class VulkanImage final { } VkMemoryRequirements get_memory_requirements() const; + + friend class api::vTensorStorage; }; struct ImageMemoryBarrier final { diff --git a/backends/vulkan/serialization/vulkan_graph_builder.py b/backends/vulkan/serialization/vulkan_graph_builder.py index da40f0a720b..20d09f1df5c 100644 --- a/backends/vulkan/serialization/vulkan_graph_builder.py +++ b/backends/vulkan/serialization/vulkan_graph_builder.py @@ -24,6 +24,9 @@ Node, NoneType, _ScalarType, TensorSpec, List[_ScalarType], List[Node], str ] +logger: logging.Logger = logging.getLogger("") +logger.setLevel(logging.INFO) + class VkGraphBuilder: def __init__( @@ -351,9 +354,9 @@ def build_graph(self) -> vk_graph_schema.VkGraph: self.process_node(node, call_node_debug_hdl) call_node_debug_hdl += 1 - logging.info("Operators included in this Vulkan partition: ") + logger.info("Operators included in this Vulkan partition: ") for op in self.seen_ops: - logging.info(f" {op.__name__}") + logger.info(f" {op.__name__}") return vk_graph_schema.VkGraph( version="0", diff --git a/backends/vulkan/test/glsl/scalar_add_texture.glsl b/backends/vulkan/test/glsl/scalar_add_texture.glsl new file mode 100644 index 00000000000..992907d0c25 --- /dev/null +++ b/backends/vulkan/test/glsl/scalar_add_texture.glsl @@ -0,0 +1,29 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#version 450 core + +#define PRECISION ${PRECISION} + +layout(std430) buffer; + +${layout_declare_tensor(0, "rw", "t_in", "float", "texture3d")} +${layout_declare_ubo(1, "ivec3", "extents")} +${layout_declare_ubo(2, "int", "scalar")} + +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +void main() { + const ivec3 pos = ivec3(gl_GlobalInvocationID); + if (any(greaterThanEqual(pos, extents))) { + return; + } + + vec4 in_tex = imageLoad(t_in, pos); + imageStore(t_in, pos, imageLoad(t_in, pos) + float(scalar)); +} diff --git a/backends/vulkan/test/op_tests/cases.py b/backends/vulkan/test/op_tests/cases.py index c5088ffdb32..9db5cc8a841 100644 --- a/backends/vulkan/test/op_tests/cases.py +++ b/backends/vulkan/test/op_tests/cases.py @@ -8,7 +8,7 @@ from collections import namedtuple from typing import Callable -from executorch.backends.vulkan.test.op_tests.utils.codegen import VkTestSuite +from executorch.backends.vulkan.test.op_tests.utils.test_suite import VkTestSuite # Prime numbers dim sizes for testing @@ -49,6 +49,7 @@ def get_binary_elementwise_inputs(): ((S, S1, S2), (S, S1, S2)), ((S, S1, S2), (S, S1, 1), 2.0), ((S, S1, S2), (S, 1, S2), 2.0), + ((XS, S, S1, S2), (XS, S, 1, 1), 2.0), ] ) test_suite.layouts = [ @@ -465,8 +466,8 @@ def get_view_inputs(): return test_suite -@register_test_suite(["aten.slice.Tensor", "aten.slice_copy.Tensor"]) -def get_slice_inputs(): +@register_test_suite("aten.slice_copy.Tensor") +def get_slice_out_inputs(): Test = namedtuple("VkSliceTest", ["self", "dim", "start", "end", "step"]) Test.__new__.__defaults__ = (None, 0, None, None, 1) @@ -548,6 +549,39 @@ def get_slice_inputs(): return test_suite +def get_slice_view_inputs(): + Test = namedtuple("VkSliceTest", ["self", "dim", "start", "end", "step"]) + Test.__new__.__defaults__ = (None, 0, None, None, 1) + + # Slice by channel + test_cases = [ + Test(self=[1, 17, 1, 10], dim=1, start=0, end=4), + Test(self=[1, 17, 1, 10], dim=1, start=0, end=8), + Test(self=[1, 17, 3, 7], dim=1, start=0, end=12), + ] + + test_suite = VkTestSuite([tuple(tc) for tc in test_cases]) + + test_suite.dtypes = ["at::kFloat"] + test_suite.storage_types = ["utils::kBuffer", "utils::kTexture3D"] + test_suite.layouts = ["utils::kWidthPacked"] + test_suite.data_gen = "make_seq_tensor" + test_suite.is_view_op = True + + return test_suite + + +@register_test_suite(["aten.slice.Tensor"]) +def get_slice_inputs(): + texture_test_suite = get_slice_out_inputs() + texture_test_suite.test_name_suffix = "no_view" + + view_test_suite = get_slice_view_inputs() + view_test_suite.test_name_suffix = "view" + + return [view_test_suite, texture_test_suite] + + @register_test_suite(["aten.transpose.int"]) def get_transpose_inputs(): Test = namedtuple("VkTransposeViewTest", ["self", "dim0", "dim1"]) @@ -558,7 +592,6 @@ def get_transpose_inputs(): Test(self=[M1, S2, M], dim0=0, dim1=1), Test(self=[M1, S2, M], dim0=0, dim1=2), Test(self=[M1, S2, M], dim0=2, dim1=1), - Test(self=[S, M, S2, M2], dim0=0, dim1=2), Test(self=[S, M, S2, M2], dim0=3, dim1=2), Test(self=[S, M, S2, M2], dim0=1, dim1=2), Test(self=[S, M, S2, M2], dim0=3, dim1=1), @@ -567,7 +600,7 @@ def get_transpose_inputs(): test_suite = VkTestSuite([tuple(tc) for tc in test_cases]) test_suite.dtypes = ["at::kFloat"] - test_suite.storage_types = ["utils::kBuffer"] + test_suite.storage_types = ["utils::kBuffer", "utils::kTexture3D"] test_suite.layouts = ["utils::kWidthPacked", "utils::kChannelsPacked"] test_suite.data_gen = "make_seq_tensor" test_suite.is_view_op = 
True @@ -904,6 +937,7 @@ def get_softmax_inputs(): "aten.neg.default", "aten.cos.default", "aten.hardswish.default", + "aten.hardsigmoid.default", ] ) def get_unary_ops_inputs(): diff --git a/backends/vulkan/test/op_tests/generate_op_benchmarks.py b/backends/vulkan/test/op_tests/generate_op_benchmarks.py new file mode 100644 index 00000000000..7f286123df9 --- /dev/null +++ b/backends/vulkan/test/op_tests/generate_op_benchmarks.py @@ -0,0 +1,88 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import argparse +import os + +from typing import Dict + +from executorch.backends.vulkan.test.op_tests.cases import test_suites + +from executorch.backends.vulkan.test.op_tests.utils.gen_benchmark_vk import ( + VkBenchmarkFileGen, +) +from executorch.backends.vulkan.test.op_tests.utils.gen_computegraph import ( + ComputeGraphGen, +) +from executorch.backends.vulkan.test.op_tests.utils.test_suite import TestSuite +from torchgen import local + +from torchgen.gen import parse_native_yaml, ParsedYaml +from torchgen.model import DispatchKey, NativeFunction + + +def registry_name(f: NativeFunction) -> str: + name = str(f.namespace) + "." + str(f.func.name) + if len(f.func.name.overload_name) == 0: + name += ".default" + return name + + +def construct_f_map(parsed_yaml: ParsedYaml) -> Dict[str, NativeFunction]: + f_map: Dict[str, NativeFunction] = {} + for f in parsed_yaml.native_functions: + f_map[registry_name(f)] = f + return f_map + + +def process_test_suites( + cpp_generator: VkBenchmarkFileGen, + f_map: Dict[str, NativeFunction], + test_suites: Dict[str, TestSuite], +) -> None: + for registry_name, op_test_suites in test_suites.items(): + f = f_map[registry_name] + if isinstance(op_test_suites, list): + for suite in op_test_suites: + cpp_generator.add_suite(registry_name, f, suite) + else: + cpp_generator.add_suite(registry_name, f, op_test_suites) + + +@local.parametrize( + use_const_ref_for_mutable_tensors=False, use_ilistref_for_tensor_lists=False +) +def generate_cpp( + native_functions_yaml_path: str, tags_path: str, output_dir: str +) -> None: + output_file = os.path.join(output_dir, "op_benchmarks.cpp") + cpp_generator = VkBenchmarkFileGen(output_file) + + parsed_yaml = parse_native_yaml(native_functions_yaml_path, tags_path) + f_map = construct_f_map(parsed_yaml) + + ComputeGraphGen.backend_key = parsed_yaml.backend_indices[DispatchKey.CPU] + + process_test_suites(cpp_generator, f_map, test_suites) + + with open(output_file, "w") as file: + file.write(cpp_generator.generate_cpp()) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--aten-yaml-path", + help="path to native_functions.yaml file.", + ) + parser.add_argument( + "--tags-path", + help="Path to tags.yaml. 
Required by yaml parsing in gen_correctness_vk system.", + ) + + parser.add_argument("-o", "--output", help="Output directory", required=True) + args = parser.parse_args() + generate_cpp(args.aten_yaml_path, args.tags_path, args.output) diff --git a/backends/vulkan/test/op_tests/generate_op_tests.py b/backends/vulkan/test/op_tests/generate_op_correctness_tests.py similarity index 68% rename from backends/vulkan/test/op_tests/generate_op_tests.py rename to backends/vulkan/test/op_tests/generate_op_correctness_tests.py index 71047ac6f49..4e51e23940b 100644 --- a/backends/vulkan/test/op_tests/generate_op_tests.py +++ b/backends/vulkan/test/op_tests/generate_op_correctness_tests.py @@ -10,12 +10,14 @@ from typing import Dict from executorch.backends.vulkan.test.op_tests.cases import test_suites +from executorch.backends.vulkan.test.op_tests.utils.gen_computegraph import ( + ComputeGraphGen, +) -from executorch.backends.vulkan.test.op_tests.utils.codegen import VkCppTestFileGen -from executorch.backends.vulkan.test.op_tests.utils.codegen_base import ( - TestSuite, - TestSuiteGen, +from executorch.backends.vulkan.test.op_tests.utils.gen_correctness_vk import ( + VkCorrectnessTestFileGen, ) +from executorch.backends.vulkan.test.op_tests.utils.test_suite import TestSuite from torchgen import local from torchgen.gen import parse_native_yaml, ParsedYaml @@ -37,13 +39,17 @@ def construct_f_map(parsed_yaml: ParsedYaml) -> Dict[str, NativeFunction]: def process_test_suites( - cpp_generator: VkCppTestFileGen, + cpp_generator: VkCorrectnessTestFileGen, f_map: Dict[str, NativeFunction], test_suites: Dict[str, TestSuite], ) -> None: - for registry_name, op_test_suite in test_suites.items(): + for registry_name, op_test_suites in test_suites.items(): f = f_map[registry_name] - cpp_generator.add_suite(registry_name, f, op_test_suite) + if isinstance(op_test_suites, list): + for suite in op_test_suites: + cpp_generator.add_suite(registry_name, f, suite) + else: + cpp_generator.add_suite(registry_name, f, op_test_suites) @local.parametrize( @@ -53,12 +59,12 @@ def generate_cpp( native_functions_yaml_path: str, tags_path: str, output_dir: str ) -> None: output_file = os.path.join(output_dir, "op_tests.cpp") - cpp_generator = VkCppTestFileGen(output_file) + cpp_generator = VkCorrectnessTestFileGen(output_file) parsed_yaml = parse_native_yaml(native_functions_yaml_path, tags_path) f_map = construct_f_map(parsed_yaml) - TestSuiteGen.backend_key = parsed_yaml.backend_indices[DispatchKey.CPU] + ComputeGraphGen.backend_key = parsed_yaml.backend_indices[DispatchKey.CPU] process_test_suites(cpp_generator, f_map, test_suites) @@ -67,16 +73,14 @@ def generate_cpp( if __name__ == "__main__": - parser = argparse.ArgumentParser( - description="Generate a simple Hello World C++ program." - ) + parser = argparse.ArgumentParser() parser.add_argument( "--aten-yaml-path", help="path to native_functions.yaml file.", ) parser.add_argument( "--tags-path", - help="Path to tags.yaml. Required by yaml parsing in codegen system.", + help="Path to tags.yaml. 
Required by yaml parsing in gen_correctness_vk system.", ) parser.add_argument("-o", "--output", help="Output directory", required=True) args = parser.parse_args() diff --git a/backends/vulkan/test/op_tests/targets.bzl b/backends/vulkan/test/op_tests/targets.bzl index 0cffb5d80be..9b6ea61de21 100644 --- a/backends/vulkan/test/op_tests/targets.bzl +++ b/backends/vulkan/test/op_tests/targets.bzl @@ -8,9 +8,22 @@ def define_common_targets(is_fbcode = False): return runtime.python_library( - name = "generate_op_tests_lib", + name = "generate_op_correctness_tests_lib", srcs = native.glob(["utils/*.py"]) + [ - "generate_op_tests.py", + "generate_op_correctness_tests.py", + "cases.py", + ], + base_module = "executorch.backends.vulkan.test.op_tests", + deps = [ + "fbsource//third-party/pypi/expecttest:expecttest", + ], + external_deps = ["torchgen"], + ) + + runtime.python_library( + name = "generate_op_benchmarks_lib", + srcs = native.glob(["utils/*.py"]) + [ + "generate_op_benchmarks.py", "cases.py", ], base_module = "executorch.backends.vulkan.test.op_tests", @@ -21,23 +34,31 @@ def define_common_targets(is_fbcode = False): ) runtime.python_binary( - name = "generate_op_tests", - main_module = "executorch.backends.vulkan.test.op_tests.generate_op_tests", + name = "generate_op_correctness_tests", + main_module = "executorch.backends.vulkan.test.op_tests.generate_op_correctness_tests", deps = [ - ":generate_op_tests_lib", + ":generate_op_correctness_tests_lib", + ], + ) + + runtime.python_binary( + name = "generate_op_benchmarks", + main_module = "executorch.backends.vulkan.test.op_tests.generate_op_benchmarks", + deps = [ + ":generate_op_benchmarks_lib", ], ) aten_src_path = runtime.external_dep_location("aten-src-path") genrule_cmd = [ - "$(exe :generate_op_tests)", + "$(exe :generate_op_correctness_tests)", "--tags-path $(location {})/aten/src/ATen/native/tags.yaml".format(aten_src_path), "--aten-yaml-path $(location {})/aten/src/ATen/native/native_functions.yaml".format(aten_src_path), "-o $OUT", ] runtime.genrule( - name = "generated_op_tests_cpp", + name = "generated_op_correctness_tests_cpp", outs = { "op_tests.cpp": ["op_tests.cpp"], }, @@ -45,6 +66,22 @@ def define_common_targets(is_fbcode = False): default_outs = ["."], ) + benchmarks_genrule_cmd = [ + "$(exe :generate_op_benchmarks)", + "--tags-path $(location {})/aten/src/ATen/native/tags.yaml".format(aten_src_path), + "--aten-yaml-path $(location {})/aten/src/ATen/native/native_functions.yaml".format(aten_src_path), + "-o $OUT", + ] + + runtime.genrule( + name = "generated_op_benchmarks_cpp", + outs = { + "op_benchmarks.cpp": ["op_benchmarks.cpp"], + }, + cmd = " ".join(benchmarks_genrule_cmd), + default_outs = ["."], + ) + pt_operator_library( name = "all_aten_ops", check_decl = False, @@ -66,7 +103,7 @@ def define_common_targets(is_fbcode = False): runtime.cxx_binary( name = "compute_graph_op_tests_bin", srcs = [ - ":generated_op_tests_cpp[op_tests.cpp]", + ":generated_op_correctness_tests_cpp[op_tests.cpp]", ], define_static_target = False, deps = [ @@ -76,10 +113,26 @@ def define_common_targets(is_fbcode = False): ], ) + runtime.cxx_binary( + name = "compute_graph_op_benchmarks_bin", + srcs = [ + ":generated_op_benchmarks_cpp[op_benchmarks.cpp]", + ], + compiler_flags = [ + "-Wno-unused-variable", + ], + define_static_target = False, + deps = [ + "//third-party/benchmark:benchmark", + "//executorch/backends/vulkan:vulkan_graph_runtime", + ":all_aten_ops_lib", + ], + ) + runtime.cxx_test( name = "compute_graph_op_tests", srcs = [ 
- ":generated_op_tests_cpp[op_tests.cpp]", + ":generated_op_correctness_tests_cpp[op_tests.cpp]", ], contacts = ["oncall+ai_infra_mobile_platform@xmail.facebook.com"], fbandroid_additional_loaded_sonames = [ diff --git a/backends/vulkan/test/op_tests/utils/aten_types.py b/backends/vulkan/test/op_tests/utils/aten_types.py new file mode 100644 index 00000000000..186f5afb78b --- /dev/null +++ b/backends/vulkan/test/op_tests/utils/aten_types.py @@ -0,0 +1,30 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +#################### +## ATen C++ Types ## +#################### + +AT_INT_ARRAY_REF = "at::IntArrayRef" +AT_SCALAR = "at::Scalar" +AT_TENSOR = "at::Tensor" +AT_TENSOR_LIST = "at::TensorList" +BOOL = "bool" +DOUBLE = "double" +INT = "int64_t" +OPT_AT_DOUBLE_ARRAY_REF = "::std::optional>" +OPT_AT_INT_ARRAY_REF = "at::OptionalIntArrayRef" +OPT_AT_TENSOR = "::std::optional" +OPT_BOOL = "::std::optional" +OPT_INT64 = "::std::optional" +OPT_DEVICE = "::std::optional" +OPT_LAYOUT = "::std::optional" +OPT_MEMORY_FORMAT = "::std::optional" +OPT_SCALAR_TYPE = "::std::optional" +STRING = "c10::string_view" +TWO_TENSOR_TUPLE = "::std::tuple" +THREE_TENSOR_TUPLE = "::std::tuple" +TENSOR_VECTOR = "::std::vector" diff --git a/backends/vulkan/test/op_tests/utils/gen_benchmark_vk.py b/backends/vulkan/test/op_tests/utils/gen_benchmark_vk.py new file mode 100644 index 00000000000..fb42d982f67 --- /dev/null +++ b/backends/vulkan/test/op_tests/utils/gen_benchmark_vk.py @@ -0,0 +1,335 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +import re + +from executorch.backends.vulkan.test.op_tests.utils.gen_computegraph import ( + ComputeGraphGen, +) +from executorch.backends.vulkan.test.op_tests.utils.gen_correctness_base import ( + CorrectnessTestGen, +) +from executorch.backends.vulkan.test.op_tests.utils.test_suite import TestSuite + +from torchgen.model import NativeFunction + +########################## +## Test Suite Generation ## +########################## + +benchmark_fixture_template = """ +class GeneratedOpBenchmark_{op_name} : public ::benchmark::Fixture {{ + protected: + ComputeGraph* graph; + at::ScalarType test_dtype = at::kFloat; + float rtol = {rtol}; + float atol = {atol}; + + {arg_valuerefs} + + void SetUp(::benchmark::State& state) override {{ + GraphConfig config; + config.descriptor_pool_safety_factor = 2.0; + test_dtype = at::ScalarType(state.range(0)); + const utils::StorageType storage_type = utils::StorageType(state.range(1)); + const utils::GPUMemoryLayout memory_layout = utils::GPUMemoryLayout(state.range(2)); + config.set_storage_type_override(storage_type); + config.set_memory_layout_override(memory_layout); + config.enable_querypool = true; + graph = new ComputeGraph(config); + }} + + void TearDown(::benchmark::State& state) override {{ + delete graph; + graph = nullptr; + }} + + {build_graph_fn} + {benchmark_fn} +}}; +""" + +benchmark_template = """ +BENCHMARK_DEFINE_F(GeneratedOpBenchmark_{op_name}, {case_name})(benchmark::State& state) {{ + {skips} + {create_ref_data} + {call_build_graph} + ShaderTimes shader_times; + for (auto _ : state) {{ + {call_benchmark} + graph->context()->querypool().extract_results(); + QueryPoolResults results = graph->context()->querypool().get_shader_timestamp_data(); + process_querypool_results(results, shader_times); + }} + register_shader_time_counters(state, shader_times); +}} + +BENCHMARK_REGISTER_F(GeneratedOpBenchmark_{op_name}, {case_name})->Threads(1)->ArgsProduct({combos}); +""" + + +class VkBenchmarkGen(CorrectnessTestGen): + def __init__(self, op_reg_name: str, f: NativeFunction, inputs: TestSuite): + super().__init__(f, inputs) + self.op_reg_name = op_reg_name + self.generator = ComputeGraphGen(self.op_reg_name, self.f, self.suite_def) + + def gen_call_benchmark(self, prepack=False) -> str: + test_str = f"benchmark_{self.op_name}(" + if prepack: + test_str = f"prepacked_benchmark_{self.op_name}(" + for binding in self.f_sig.arguments(): + arg = binding.argument + test_str += f"{arg.name}, " + test_str = test_str[:-2] + ");" + test_str = re.sub(r"^", " ", test_str, flags=re.M) + return test_str + + def gen_call_build_graph(self, prepack=False) -> str: + test_str = f"build_graph_{self.op_name}(" + if prepack: + test_str = f"prepacked_build_graph_{self.op_name}(" + for binding in self.f_sig.arguments(): + arg = binding.argument + test_str += f"{arg.name}, " + test_str = test_str[:-2] + ");" + test_str = re.sub(r"^", " ", test_str, flags=re.M) + return test_str + + def gen_combos(self, inputs) -> str: + dtypes_list = ", ".join(f"int({dtype})" for dtype in self.suite_def.dtypes) + storage_types_list = ", ".join( + f"int({storage_type})" for storage_type in self.suite_def.storage_types + ) + layouts_list = ", ".join(f"int({layout})" for layout in self.suite_def.layouts) + return f"{{ {{ {dtypes_list} }}, {{ {storage_types_list} }}, {{ {layouts_list} }} }}" + + def generate_benchmark_case(self, inputs, prepack=False) -> str: + return benchmark_template.format( + op_name=f"{self.op_name}", + case_name=self.gen_case_name(inputs, prepack), + 
skips=self.generator.gen_conditional_skips( + 'state.SkipWithError("unsupported type"); return;' + ), + create_ref_data=self.gen_create_ref_data(inputs), + call_build_graph=self.gen_call_build_graph(prepack), + call_benchmark=self.gen_call_benchmark(prepack), + combos=self.gen_combos(inputs), + ) + + def generate_benchmark(self) -> str: + benchmarks_cpp = "" + for inputs in self.suite_def.input_cases: + if not self.suite_def.requires_prepack: + benchmarks_cpp += self.generate_benchmark_case(inputs) + if self.suite_def.supports_prepack(): + benchmarks_cpp += self.generate_benchmark_case(inputs, prepack=True) + return benchmarks_cpp + + def generate_benchmark_fixture(self) -> str: + build_graph_fn = "" + benchmark_fn = "" + if not self.suite_def.requires_prepack: + build_graph_fn = self.generator.gen_build_graph_fn() + benchmark_fn = self.generator.gen_op_exec_graph_fn() + + prepacked_build_graph_fn = "" + prepacked_benchmark_fn = "" + if self.suite_def.supports_prepack(): + self.generator.should_prepack = True + prepacked_build_graph_fn = self.generator.gen_build_graph_fn() + build_graph_fn += "\n\n " + build_graph_fn += prepacked_build_graph_fn + prepacked_benchmark_fn = self.generator.gen_op_exec_graph_fn() + benchmark_fn += "\n\n " + benchmark_fn += prepacked_benchmark_fn + + return benchmark_fixture_template.format( + op_name=self.op_name, + build_graph_fn=build_graph_fn, + benchmark_fn=benchmark_fn, + rtol=self.suite_def.rtol, + arg_valuerefs=self.generator.gen_arg_valueref_decls(), + atol=self.suite_def.atol, + ) + + +########################## +## Test File Generation ## +########################## + +cpp_test_template = """ +#include +#include +#include + +#include +#include +#include + +using namespace vkcompute; +using TensorOptions = at::TensorOptions; + +vkapi::ScalarType from_at_scalartype(c10::ScalarType at_scalartype) {{ + switch (at_scalartype) {{ + case c10::kFloat: + return vkapi::kFloat; + case c10::kHalf: + return vkapi::kHalf; + case c10::kInt: + return vkapi::kInt; + case c10::kLong: + return vkapi::kInt; + case c10::kChar: + return vkapi::kChar; + default: + VK_THROW("Unsupported at::ScalarType!"); + }} +}} + +at::Tensor make_rand_tensor( + std::vector sizes, + at::ScalarType dtype = at::kFloat, + float low = 0.0, + float high = 1.0) {{ + if (high == 1.0 && low == 0.0) + return at::rand(sizes, at::device(at::kCPU).dtype(dtype)); + + if (dtype == at::kChar) + return at::randint(high, sizes, at::device(at::kCPU).dtype(dtype)); + + return at::rand(sizes, at::device(at::kCPU).dtype(dtype)) * (high - low) + low; +}} + +at::Tensor make_seq_tensor( + std::vector sizes, + at::ScalarType dtype = at::kFloat, + float low = 0.0, + float high = 1.0) {{ + (void)low; + (void)high; + + int64_t n = 1; + for (auto size: sizes) {{ + n *= size; + }} + + std::vector values(n); + for (int i=0;i indices) {{ + at::ScalarType dtype = at::kInt; + std::vector sizes = {{static_cast(indices.size())}}; + + // Clone as original data will be deallocated upon return. + return at::from_blob(indices.data(), sizes, dtype).detach().clone(); +}} + +at::Tensor make_index_tensor(std::vector> indices) {{ + at::ScalarType dtype = at::kInt; + std::vector sizes = {{ + static_cast(indices.size()), + static_cast(indices[0].size())}}; + + // Flatten indices as from_blob reads garbage otherwise. + std::vector acc; + for (auto& vec: indices) {{ + acc.insert(acc.end(), vec.begin(), vec.end()); + }} + + // Clone as original data will be deallocated upon return. 
+ return at::from_blob(acc.data(), sizes, dtype).detach().clone(); +}} + +at::Tensor make_index_tensor(std::vector>> indices) {{ + at::ScalarType dtype = at::kInt; + std::vector sizes = {{ + static_cast(indices.size()), + static_cast(indices[0].size()), + static_cast(indices[0][0].size())}}; + + // Flatten indices as from_blob reads garbage otherwise. + std::vector acc; + for (auto& v: indices) {{ + for (auto& vv: v) {{ + acc.insert(acc.end(), vv.begin(), vv.end()); + }} + }} + + // Clone as original data will be deallocated upon return. + return at::from_blob(acc.data(), sizes, dtype).detach().clone(); +}} + +using ShaderEntry = std::tuple; +using QueryPoolResults = std::vector; +using ShaderTimes = std::unordered_map>; + +void process_querypool_results( + QueryPoolResults& results, + ShaderTimes& shader_times) {{ + for (const ShaderEntry& entry : results) {{ + std::string kernel_name = std::get<0>(entry); + std::uint64_t start_ns = std::get<2>(entry); + std::uint64_t end_ns = std::get<3>(entry); + std::uint64_t duration_ns = end_ns - start_ns; + if (shader_times.find(kernel_name) == shader_times.end()) {{ + shader_times[kernel_name] = std::vector(); + }} + shader_times[kernel_name].emplace_back(duration_ns); + }} +}} + +void register_shader_time_counters( + benchmark::State& state, + ShaderTimes& shader_times) {{ + for (auto& times_list : shader_times) {{ + // Filter to_nchw and nchw_to shaders + if (times_list.first.find("to_nchw") != std::string::npos) {{ + continue; + }} + if (times_list.first.find("nchw_to") != std::string::npos) {{ + continue; + }} + + std::sort(times_list.second.begin(), times_list.second.end()); + uint64_t median_time; + median_time = times_list.second[times_list.second.size() / 2]; + state.counters[times_list.first + " median ns"] = median_time; + }} +}} + +{benchmark_fixtures} + +{def_benchmarks} +""" + + +class VkBenchmarkFileGen: + def __init__(self, out_path): + self.out_path = out_path + self.suites_gens = [] + + def add_suite(self, op_reg_name: str, f: NativeFunction, all_input_cases) -> None: + suites_gen = VkBenchmarkGen(op_reg_name, f, all_input_cases) + self.suites_gens.append(suites_gen) + + def generate_benchmarks_cpp(self) -> str: + return "\n".join([h.generate_benchmark() for h in self.suites_gens]) + + def generate_benchmark_fixtures(self) -> str: + return "\n".join([h.generate_benchmark_fixture() for h in self.suites_gens]) + + def generate_cpp(self) -> str: + return cpp_test_template.format( + benchmark_fixtures=self.generate_benchmark_fixtures(), + def_benchmarks=self.generate_benchmarks_cpp(), + ) diff --git a/backends/vulkan/test/op_tests/utils/codegen.py b/backends/vulkan/test/op_tests/utils/gen_computegraph.py similarity index 77% rename from backends/vulkan/test/op_tests/utils/codegen.py rename to backends/vulkan/test/op_tests/utils/gen_computegraph.py index b39801e7660..f6ee9c78a14 100644 --- a/backends/vulkan/test/op_tests/utils/codegen.py +++ b/backends/vulkan/test/op_tests/utils/gen_computegraph.py @@ -6,15 +6,14 @@ import re from dataclasses import dataclass -from typing import Any, List, Optional, Union +from typing import List, Optional, Union -from executorch.backends.vulkan.test.op_tests.utils.codegen_base import ( +from executorch.backends.vulkan.test.op_tests.utils.aten_types import ( AT_INT_ARRAY_REF, AT_SCALAR, AT_TENSOR, AT_TENSOR_LIST, BOOL, - CppTestFileGen, DOUBLE, INT, OPT_AT_DOUBLE_ARRAY_REF, @@ -28,37 +27,20 @@ OPT_SCALAR_TYPE, STRING, TENSOR_VECTOR, - TestSuite, - TestSuiteGen, THREE_TENSOR_TUPLE, TWO_TENSOR_TUPLE, ) 
+from executorch.backends.vulkan.test.op_tests.utils.test_suite import TestSuite from torchgen.api import cpp from torchgen.api.types import CppSignatureGroup - from torchgen.gen import generate_static_dispatch_backend_call, translate_args - from torchgen.gen_aoti_c_shim import gen_static_dispatch_backend_call_signature from torchgen.model import NativeFunction, Variant -################################## -## Custom Test Suite Definition ## -################################## - - -@dataclass -class VkTestSuite(TestSuite): - def __init__(self, input_cases: List[Any]): - super().__init__(input_cases) - self.storage_types: List[str] = ["utils::kTexture3D"] - self.layouts: List[str] = ["utils::kChannelsPacked"] - self.data_gen: str = "make_rand_tensor" - - -########################## -## Code Generator Class ## -########################## +################################### +## Compute Graph Code Generation ## +################################### @dataclass @@ -105,6 +87,8 @@ def vk_out(self): class ComputeGraphGen: + backend_key = None + def __init__(self, op_reg_name: str, f: NativeFunction, suite_def: TestSuite): self.op_reg_name = op_reg_name self.f = f @@ -230,7 +214,7 @@ def gen_decl(self, fn_name: str, ret_type: str = "void") -> str: def create_aten_fn_call(self) -> str: func_call = generate_static_dispatch_backend_call( - self.f_sig, self.f, TestSuiteGen.backend_key + self.f_sig, self.f, ComputeGraphGen.backend_key )[7:].replace("::cpu", "") return func_call @@ -244,11 +228,12 @@ def create_aten_method_call(self) -> str: func_call = f"ATEN_FN({self.f_sig.name()})({exprs});" return func_call - def create_out_src(self) -> str: + def create_out_src(self, include_declarations: bool = True) -> str: + cpp_type = self.out.cpp_type if include_declarations else "" if Variant.function in self.f.variants: - return f"{self.out.cpp_type} out = " + self.create_aten_fn_call() + "\n" + return f"{cpp_type} out = " + self.create_aten_fn_call() + "\n" else: - return f"{self.out.cpp_type} out = " + self.create_aten_method_call() + "\n" + return f"{cpp_type} out = " + self.create_aten_method_call() + "\n" ## Graph code generation utils @@ -258,7 +243,28 @@ def prepack_ref(self, ref: ValueRef) -> bool: else: return ref.supports_prepack and self.should_prepack - def create_value_for(self, ref: ValueRefList) -> str: # noqa: C901 + def create_value_decl_for(self, ref: ValueRefList) -> str: # noqa: C901 + if isinstance(ref, list): + ret_str = "" + for r in ref: + ret_str += self.create_value_decl_for(r) + return ret_str + + cpp_type = "IOValueRef" if (ref.is_in or ref.requires_prepack) else "ValueRef" + if ref.src_cpp_type == AT_TENSOR_LIST: + ret_str = f"std::vector {ref.name}_io_value_refs;\n" + ret_str += f"std::vector {ref.name}_value_refs;\n" + return ret_str + elif ref.src_cpp_type == TENSOR_VECTOR: + ret_str = f"std::vector {ref.io_value_list_name};\n" + ret_str += f"std::vector {ref.value_list_name};\n" + return ret_str + else: + return f"{cpp_type} {ref.name};\n" + + def create_value_for( # noqa: C901 + self, ref: ValueRefList, include_declarations: bool = True + ) -> str: if isinstance(ref, list): ret_str = "" for r in ref: @@ -269,9 +275,16 @@ def create_value_for(self, ref: ValueRefList) -> str: # noqa: C901 ref_is_view = self.suite_def.is_view_op and ref.is_out cpp_type = "IOValueRef" if (ref.is_in and not prepack) else "ValueRef" + if not include_declarations: + cpp_type = "" if ref.src_cpp_type == OPT_AT_TENSOR: ret_str = f"{cpp_type} {ref.name} = " + if prepack: + ret_str = "" + if 
include_declarations: + ret_str += f"IOValueRef {ref.name};\n" + ret_str += f"{ref.name}.value = " ret_str += f"!{ref.src_cpp_name}.has_value() ? " ret_str += f"{self.graph}{self.dot}add_none() : " if not prepack: @@ -308,11 +321,13 @@ def create_value_for(self, ref: ValueRefList) -> str: # noqa: C901 # each tensor, to facilate staging. On the other hand, we will # use the .value tensor to create a ValueList, which will be passed # to the corresponding ops. - ret_str = f"std::vector {ref.name}_io_value_refs;\n" - ret_str += f"std::vector {ref.name}_value_refs;\n" + ret_str = "" + if include_declarations: + ret_str += f"std::vector {ref.name}_io_value_refs;\n" + ret_str += f"std::vector {ref.name}_value_refs;\n" ret_str += f"for (int i=0; i < {ref.src_cpp_name}.size(); i++) {{\n" ret_str += ( - f" {cpp_type} io_value_ref = {self.graph}{self.dot}add_input_tensor(\n" + f" IOValueRef io_value_ref = {self.graph}{self.dot}add_input_tensor(\n" ) ret_str += f" {ref.src_cpp_name}[i].sizes().vec(),\n" ret_str += ( @@ -324,9 +339,11 @@ def create_value_for(self, ref: ValueRefList) -> str: # noqa: C901 ret_str += f"ValueRef {ref.name} = {self.graph}{self.dot}add_value_list(std::move({ref.name}_value_refs));\n" return ret_str elif ref.src_cpp_type == TENSOR_VECTOR: - ret_str = f""" -std::vector {ref.io_value_list_name}; -std::vector {ref.value_list_name}; + ret_str = "" + if include_declarations: + ret_str += f"std::vector {ref.io_value_list_name};\n" + ret_str += f"std::vector {ref.value_list_name};\n" + ret_str += f""" for (int i=0; i str: # noqa: C901 return ret_str ret_str = f"{cpp_type} {ref.name} = {self.graph}{self.dot}" + if prepack: + ret_str = "" + if include_declarations: + ret_str = f"IOValueRef {ref.name};\n" + ret_str += f"{ref.name}.value = {self.graph}{self.dot}" + if ref.src_cpp_type == AT_TENSOR and ref_is_view: input_name = None for _name, ref in self.refs.items(): @@ -347,8 +370,7 @@ def create_value_for(self, ref: ValueRefList) -> str: # noqa: C901 input_name = ref.name assert input_name is not None - ret_str += "add_tensor_view(" + input_name + ".value);" - pass + ret_str += f"add_tensor_view({input_name}.value);" elif ref.src_cpp_type == AT_TENSOR and not prepack: ret_str += "add_input_tensor(" if ref.is_in else "add_tensor(" ret_str += f"{ref.src_cpp_name}.sizes().vec(), " @@ -400,14 +422,29 @@ def create_op_call(self) -> str: else: op_create_code += ( f"{ref.name}.value, " - if (ref.is_in and not self.prepack_ref(ref)) or ref.is_out + if ref.is_in or ref.requires_prepack or ref.is_out else f"{ref.name}, " ) + # op_create_code += f"{ref.name}, " op_create_code += "out_ref});\n" return op_create_code - def set_output(self, ref: ValueRefList) -> str: + def gen_output_staging_valueref_decl(self, ref: ValueRefList) -> str: + if isinstance(ref, list): + ret_str = "" + for r in ref[:-1]: + ret_str += self.gen_output_staging_valueref_decl(r) + return ret_str + elif ref.src_cpp_type == TENSOR_VECTOR: + assert ref.is_out + ret_str = "" + return ret_str + + assert ref.src_cpp_type == AT_TENSOR and ref.is_out + return f"ValueRef {ref.name}_staging;\n" + + def set_output(self, ref: ValueRefList, include_declarations: bool = True) -> str: if isinstance(ref, list): ret_str = "" for r in ref[:-1]: @@ -424,7 +461,8 @@ def set_output(self, ref: ValueRefList) -> str: return ret_str assert ref.src_cpp_type == AT_TENSOR and ref.is_out - ret_str = f"ValueRef {ref.name}_staging = {self.graph}{self.dot}" + cpptype = "ValueRef" if include_declarations else "" + ret_str = f"{cpptype} {ref.name}_staging 
= {self.graph}{self.dot}" ret_str += f"set_output_tensor({ref.name});\n" return ret_str @@ -542,15 +580,28 @@ def check_graph_out(self, ref: ValueRefList) -> str: ## Top level code generation - def gen_graph_build_code(self) -> str: - graph_build = self.create_out_src() + def gen_arg_valueref_decls(self) -> str: + ret_str = "" + for aten_arg in self.args: + ref = self.refs[aten_arg.name] + ret_str += self.create_value_decl_for(ref) + + ret_str += self.create_value_decl_for(self.refs["out"]) + ret_str += f"{self.out.cpp_type} out;\n" + ret_str += self.gen_output_staging_valueref_decl(self.refs["out"]) + return ret_str + + def gen_graph_build_code(self, include_declarations: bool = True) -> str: + graph_build = self.create_out_src(include_declarations) for aten_arg in self.args: - graph_build += self.create_value_for(self.refs[aten_arg.name]) + graph_build += self.create_value_for( + self.refs[aten_arg.name], include_declarations + ) - graph_build += self.create_value_for(self.refs["out"]) + graph_build += self.create_value_for(self.refs["out"], include_declarations) graph_build += self.create_op_call() - graph_build += self.set_output(self.refs["out"]) + graph_build += self.set_output(self.refs["out"], include_declarations) graph_build += f"{self.graph}{self.dot}prepare();\n" graph_build += f"{self.graph}{self.dot}encode_prepack();\n" @@ -560,7 +611,7 @@ def gen_graph_build_code(self) -> str: graph_build += "\n" return graph_build - def gen_graph_exec_code(self) -> str: + def gen_graph_exec_code(self, check_output=True) -> str: graph_exec = "" for aten_arg in self.args: ref = self.refs[aten_arg.name] @@ -573,26 +624,27 @@ def gen_graph_exec_code(self) -> str: graph_exec += self.declare_vk_out_for(self.refs["out"]) graph_exec += self.copy_from_staging(self.refs["out"]) - graph_exec += self.check_graph_out(self.refs["out"]) + if check_output: + graph_exec += self.check_graph_out(self.refs["out"]) graph_exec = re.sub(r"^", " ", graph_exec, flags=re.M) graph_exec = "{\n" + graph_exec + "\n}" return graph_exec - def gen_conditional_skips(self) -> str: + def gen_conditional_skips(self, skip_str: str = "GTEST_SKIP();") -> str: fp16_skip = f"if (!{self.graph}{self.dot}context()->adapter_ptr()->has_full_float16_buffers_support()) {{\n" - fp16_skip += " GTEST_SKIP();\n" + fp16_skip += f" {skip_str}\n" fp16_skip += "}" fp16_skip = re.sub(r"^", " ", fp16_skip, flags=re.M) + "\n" int8_skip = f"if (!{self.graph}{self.dot}context()->adapter_ptr()->has_full_int8_buffers_support()) {{\n" - int8_skip += " GTEST_SKIP();\n" + int8_skip += f" {skip_str};\n" int8_skip += "}\n" skips = "" - skips = "if (test_dtype == at::kHalf) {\n" + skips += "if (test_dtype == at::kHalf) {\n" skips += fp16_skip skips += "}\n" @@ -606,6 +658,9 @@ def gen_conditional_skips(self) -> str: def gen_op_check_fn(self) -> str: op_name = self.f.func.name.unambiguous_name() + if self.suite_def.test_name_suffix is not None: + op_name += "_" + self.suite_def.test_name_suffix + op_check_fn = self.gen_decl(f"check_{op_name}") + " {\n" if self.should_prepack: op_check_fn = self.gen_decl(f"prepacked_check_{op_name}") + " {\n" @@ -622,146 +677,36 @@ def gen_op_check_fn(self) -> str: return op_check_fn + def gen_build_graph_fn(self, include_declarations: bool = False) -> str: + op_name = self.f.func.name.unambiguous_name() + if self.suite_def.test_name_suffix is not None: + op_name += "_" + self.suite_def.test_name_suffix + op_build_graph_fn = self.gen_decl(f"build_graph_{op_name}") + " {\n" + if self.should_prepack: + op_build_graph_fn = ( + 
self.gen_decl(f"prepacked_build_graph_{op_name}") + " {\n" + ) -################################## -## Test Fixture Code Generation ## -################################## - -test_fixture_template = """ -class GeneratedOpsTest_{op_name} : public ::testing::TestWithParam< ::std::tuple> {{ - protected: - ComputeGraph* graph; - at::ScalarType test_dtype = at::kFloat; - float rtol = {rtol}; - float atol = {atol}; - - void SetUp() override {{ - GraphConfig config; - utils::StorageType default_storage_type; - utils::GPUMemoryLayout default_memory_layout; - std::tie(test_dtype, default_storage_type, default_memory_layout) = GetParam(); - config.set_storage_type_override(default_storage_type); - config.set_memory_layout_override(default_memory_layout); - graph = new ComputeGraph(config); - - if (test_dtype == at::kHalf) {{ - rtol = 1e-2; - atol = 1e-2; - }} - }} - - void TearDown() override {{ - delete graph; - graph = nullptr; - }} - - {check_fn} -}}; -""" - - -class VkTestSuiteGen(TestSuiteGen): - def __init__(self, op_reg_name: str, f: NativeFunction, inputs: VkTestSuite): - super().__init__(f, inputs) - self.op_reg_name = op_reg_name - self.generator = ComputeGraphGen(self.op_reg_name, self.f, self.suite_def) - - def generate_fixture_cpp(self) -> str: - check_fn = "" - if not self.suite_def.requires_prepack: - check_fn = self.generator.gen_op_check_fn() - - prepacked_check_fn = "" - if self.suite_def.supports_prepack(): - self.generator.should_prepack = True - prepacked_check_fn = self.generator.gen_op_check_fn() - check_fn += "\n\n " - check_fn += prepacked_check_fn - - return test_fixture_template.format( - op_name=self.op_name, - check_fn=check_fn, - rtol=self.suite_def.rtol, - atol=self.suite_def.atol, - ) + op_build_graph_fn_body = "" + op_build_graph_fn_body += self.gen_graph_build_code(include_declarations) - def gen_parameterization(self) -> str: - dtypes = self.suite_def.dtypes - storage_types = self.suite_def.storage_types - layouts = self.suite_def.layouts - - return f""" -INSTANTIATE_TEST_SUITE_P( - Combos_{self.op_name}, - GeneratedOpsTest_{self.op_name}, - ::testing::Combine( - ::testing::Values({', '.join(dtypes)}), - ::testing::Values({', '.join(storage_types)}), - ::testing::Values({', '.join(layouts)}))); - """ - - -############################## -## Test File Code Generation ## -############################### - -preamble_str = """ -#include -#include -#include - -#include - -using namespace vkcompute; -using TensorOptions = at::TensorOptions; - -vkapi::ScalarType from_at_scalartype(c10::ScalarType at_scalartype) { - switch (at_scalartype) { - case c10::kFloat: - return vkapi::kFloat; - case c10::kHalf: - return vkapi::kHalf; - case c10::kInt: - return vkapi::kInt; - case c10::kLong: - return vkapi::kInt; - case c10::kChar: - return vkapi::kChar; - default: - VK_THROW("Unsupported at::ScalarType!"); - } -} - -#ifdef USE_VULKAN_FP16_INFERENCE -bool check_close(at::Tensor& t1, at::Tensor& t2, float rtol=1e-2, float atol=1e-2) { -#else -bool check_close(at::Tensor& t1, at::Tensor& t2, float rtol=1e-5, float atol=1e-5) { -#endif - // Skip checking index tensors - if (t1.scalar_type() == at::kLong || t2.scalar_type() == at::kLong) { - return true; - } - bool is_close = at::allclose(t1, t2, rtol, atol); - if (!is_close && t1.numel() < 500) { - std::cout << "reference: " << std::endl; - print(t1, 150); - std::cout << std::endl; - std::cout << "vulkan: " << std::endl; - print(t2, 150); - std::cout << std::endl; - } - return is_close; -} -""" + op_build_graph_fn += 
op_build_graph_fn_body + op_build_graph_fn += "\n }" + return op_build_graph_fn + def gen_op_exec_graph_fn(self) -> str: + op_name = self.f.func.name.unambiguous_name() + if self.suite_def.test_name_suffix is not None: + op_name += "_" + self.suite_def.test_name_suffix + op_benchmark_fn = self.gen_decl(f"benchmark_{op_name}") + " {\n" + if self.should_prepack: + op_benchmark_fn = self.gen_decl(f"prepacked_benchmark_{op_name}") + " {\n" -class VkCppTestFileGen(CppTestFileGen): - def __init__(self, out_path: str): - super().__init__(out_path) + op_benchmark_fn_body = "" + op_benchmark_fn_body += self.gen_graph_exec_code(False) - def generate_preamble(self) -> str: - return preamble_str + op_benchmark_fn_body = re.sub(r"^", " ", op_benchmark_fn_body, flags=re.M) - def add_suite(self, op_reg_name: str, f: NativeFunction, all_input_cases) -> None: - suites_gen = VkTestSuiteGen(op_reg_name, f, all_input_cases) - self.suites_gens.append(suites_gen) + op_benchmark_fn += op_benchmark_fn_body + op_benchmark_fn += "\n }" + return op_benchmark_fn diff --git a/backends/vulkan/test/op_tests/utils/codegen_base.py b/backends/vulkan/test/op_tests/utils/gen_correctness_base.py similarity index 87% rename from backends/vulkan/test/op_tests/utils/codegen_base.py rename to backends/vulkan/test/op_tests/utils/gen_correctness_base.py index 1ebebe699a0..def3508a8a7 100644 --- a/backends/vulkan/test/op_tests/utils/codegen_base.py +++ b/backends/vulkan/test/op_tests/utils/gen_correctness_base.py @@ -7,62 +7,31 @@ import re from typing import Any, List +from executorch.backends.vulkan.test.op_tests.utils.aten_types import ( + AT_INT_ARRAY_REF, + AT_SCALAR, + AT_TENSOR, + AT_TENSOR_LIST, + BOOL, + DOUBLE, + INT, + OPT_AT_DOUBLE_ARRAY_REF, + OPT_AT_INT_ARRAY_REF, + OPT_AT_TENSOR, + OPT_BOOL, + OPT_DEVICE, + OPT_INT64, + OPT_LAYOUT, + OPT_MEMORY_FORMAT, + OPT_SCALAR_TYPE, + STRING, +) +from executorch.backends.vulkan.test.op_tests.utils.test_suite import TestSuite + from torchgen.api import cpp from torchgen.api.types import CppSignatureGroup from torchgen.model import Argument, NativeFunction -######################## -## ATen code patterns ## -######################## - -AT_INT_ARRAY_REF = "at::IntArrayRef" -AT_SCALAR = "at::Scalar" -AT_TENSOR = "at::Tensor" -AT_TENSOR_LIST = "at::TensorList" -BOOL = "bool" -DOUBLE = "double" -INT = "int64_t" -OPT_AT_DOUBLE_ARRAY_REF = "::std::optional>" -OPT_AT_INT_ARRAY_REF = "at::OptionalIntArrayRef" -OPT_AT_TENSOR = "::std::optional" -OPT_BOOL = "::std::optional" -OPT_INT64 = "::std::optional" -OPT_DEVICE = "::std::optional" -OPT_LAYOUT = "::std::optional" -OPT_MEMORY_FORMAT = "::std::optional" -OPT_SCALAR_TYPE = "::std::optional" -STRING = "c10::string_view" -TWO_TENSOR_TUPLE = "::std::tuple" -THREE_TENSOR_TUPLE = "::std::tuple" -TENSOR_VECTOR = "::std::vector" - -########################### -## Test Suite definition ## -########################### - - -class TestSuite: - def __init__(self, input_cases: List[Any]): - self.input_cases: List[Any] = input_cases - self.prepacked_args: List[str] = [] - self.requires_prepack: bool = False - self.dtypes: List[str] = ["at::kFloat", "at::kHalf"] - - self.data_gen: str = "make_rand_tensor" - self.data_range = (0, 1) - - self.arg_dtype = {} - self.arg_data_range = {} - - self.atol: str = "1e-5" - self.rtol: str = "1e-5" - - self.is_view_op: bool = False - - def supports_prepack(self): - return len(self.prepacked_args) > 0 - - ########################## ## Test Suite Generation ## ########################## @@ -105,13 +74,13 @@ def 
get_or_return_default(arg: Argument, inputs: List[Any], i: int): return arg.default -class TestSuiteGen: - backend_key = None - +class CorrectnessTestGen: def __init__(self, f: NativeFunction, test_suite: TestSuite): self.f = f self.suite_def = test_suite self.op_name = f.func.name.unambiguous_name() + if test_suite.test_name_suffix is not None: + self.op_name += f"_{test_suite.test_name_suffix}" self.f_sig = CppSignatureGroup.from_native_function( self.f, method=False, fallback_binding=self.f.manual_cpp_binding @@ -379,7 +348,7 @@ def generate_suite_cpp(self) -> str: """ -class CppTestFileGen: +class CorrectnessTestFileGen: def __init__(self, out_path): self.out_path = out_path self.suites_gens = [] @@ -397,5 +366,5 @@ def generate_test_suites_cpp(self) -> str: return "\n".join([h.generate_suite_cpp() for h in self.suites_gens]) def add_suite(self, op_reg_name: str, f: NativeFunction, all_input_cases) -> None: - suites_gen = TestSuiteGen(f, all_input_cases) + suites_gen = CorrectnessTestGen(f, all_input_cases) self.suites_gens.append(suites_gen) diff --git a/backends/vulkan/test/op_tests/utils/gen_correctness_vk.py b/backends/vulkan/test/op_tests/utils/gen_correctness_vk.py new file mode 100644 index 00000000000..6c165a777db --- /dev/null +++ b/backends/vulkan/test/op_tests/utils/gen_correctness_vk.py @@ -0,0 +1,159 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +from executorch.backends.vulkan.test.op_tests.utils.gen_computegraph import ( + ComputeGraphGen, +) +from executorch.backends.vulkan.test.op_tests.utils.gen_correctness_base import ( + CorrectnessTestFileGen, + CorrectnessTestGen, +) +from executorch.backends.vulkan.test.op_tests.utils.test_suite import VkTestSuite + +from torchgen.model import NativeFunction + +################################## +## Test Fixture Code Generation ## +################################## + +test_fixture_template = """ +class GeneratedOpsTest_{op_name} : public ::testing::TestWithParam< ::std::tuple> {{ + protected: + ComputeGraph* graph; + at::ScalarType test_dtype = at::kFloat; + float rtol = {rtol}; + float atol = {atol}; + + void SetUp() override {{ + GraphConfig config; + utils::StorageType default_storage_type; + utils::GPUMemoryLayout default_memory_layout; + std::tie(test_dtype, default_storage_type, default_memory_layout) = GetParam(); + config.set_storage_type_override(default_storage_type); + config.set_memory_layout_override(default_memory_layout); + graph = new ComputeGraph(config); + + if (test_dtype == at::kHalf) {{ + rtol = 1e-2; + atol = 1e-2; + }} + }} + + void TearDown() override {{ + delete graph; + graph = nullptr; + }} + + {check_fn} +}}; +""" + + +class VkCorrectnessTestGen(CorrectnessTestGen): + def __init__(self, op_reg_name: str, f: NativeFunction, inputs: VkTestSuite): + super().__init__(f, inputs) + self.op_reg_name = op_reg_name + self.generator = ComputeGraphGen(self.op_reg_name, self.f, self.suite_def) + + def generate_fixture_cpp(self) -> str: + check_fn = "" + if not self.suite_def.requires_prepack: + check_fn = self.generator.gen_op_check_fn() + + prepacked_check_fn = "" + if self.suite_def.supports_prepack(): + self.generator.should_prepack = True + prepacked_check_fn = self.generator.gen_op_check_fn() + check_fn += "\n\n " + check_fn += prepacked_check_fn + + return test_fixture_template.format( + op_name=self.op_name, + check_fn=check_fn, + 
rtol=self.suite_def.rtol, + atol=self.suite_def.atol, + ) + + def gen_parameterization(self) -> str: + dtypes = self.suite_def.dtypes + storage_types = self.suite_def.storage_types + layouts = self.suite_def.layouts + + return f""" +INSTANTIATE_TEST_SUITE_P( + Combos_{self.op_name}, + GeneratedOpsTest_{self.op_name}, + ::testing::Combine( + ::testing::Values({', '.join(dtypes)}), + ::testing::Values({', '.join(storage_types)}), + ::testing::Values({', '.join(layouts)}))); + """ + + +############################## +## Test File Code Generation ## +############################### + +preamble_str = """ +#include +#include +#include + +#include + +using namespace vkcompute; +using TensorOptions = at::TensorOptions; + +vkapi::ScalarType from_at_scalartype(c10::ScalarType at_scalartype) { + switch (at_scalartype) { + case c10::kFloat: + return vkapi::kFloat; + case c10::kHalf: + return vkapi::kHalf; + case c10::kInt: + return vkapi::kInt; + case c10::kLong: + return vkapi::kInt; + case c10::kChar: + return vkapi::kChar; + default: + VK_THROW("Unsupported at::ScalarType!"); + } +} + +#ifdef USE_VULKAN_FP16_INFERENCE +bool check_close(at::Tensor& t1, at::Tensor& t2, float rtol=1e-2, float atol=1e-2) { +#else +bool check_close(at::Tensor& t1, at::Tensor& t2, float rtol=1e-5, float atol=1e-5) { +#endif + // Skip checking index tensors + if (t1.scalar_type() == at::kLong || t2.scalar_type() == at::kLong) { + return true; + } + bool is_close = at::allclose(t1, t2, rtol, atol); + if (!is_close && t1.numel() < 500) { + std::cout << "reference: " << std::endl; + print(t1, 150); + std::cout << std::endl; + std::cout << "vulkan: " << std::endl; + print(t2, 150); + std::cout << std::endl; + } + return is_close; +} +""" + + +class VkCorrectnessTestFileGen(CorrectnessTestFileGen): + def __init__(self, out_path: str): + super().__init__(out_path) + + def generate_preamble(self) -> str: + return preamble_str + + def add_suite(self, op_reg_name: str, f: NativeFunction, all_input_cases) -> None: + suites_gen = VkCorrectnessTestGen(op_reg_name, f, all_input_cases) + self.suites_gens.append(suites_gen) diff --git a/backends/vulkan/test/op_tests/utils/test_suite.py b/backends/vulkan/test/op_tests/utils/test_suite.py new file mode 100644 index 00000000000..dd01bdde3a4 --- /dev/null +++ b/backends/vulkan/test/op_tests/utils/test_suite.py @@ -0,0 +1,49 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +from dataclasses import dataclass +from typing import Any, List, Optional + +################################### +## Generic Test Suite definition ## +################################### + + +class TestSuite: + def __init__(self, input_cases: List[Any]): + self.input_cases: List[Any] = input_cases + self.prepacked_args: List[str] = [] + self.requires_prepack: bool = False + self.dtypes: List[str] = ["at::kFloat", "at::kHalf"] + + self.data_gen: str = "make_rand_tensor" + self.data_range = (0, 1) + + self.arg_dtype = {} + self.arg_data_range = {} + + self.atol: str = "1e-5" + self.rtol: str = "1e-5" + + self.is_view_op: bool = False + self.test_name_suffix: Optional[str] = None + + def supports_prepack(self): + return len(self.prepacked_args) > 0 + + +################################## +## Vulkan Test Suite Definition ## +################################## + + +@dataclass +class VkTestSuite(TestSuite): + def __init__(self, input_cases: List[Any]): + super().__init__(input_cases) + self.storage_types: List[str] = ["utils::kTexture3D"] + self.layouts: List[str] = ["utils::kChannelsPacked"] + self.data_gen: str = "make_rand_tensor" diff --git a/backends/vulkan/test/test_vulkan_delegate.py b/backends/vulkan/test/test_vulkan_delegate.py index 9f57ec49a89..e6ddf1cdb86 100644 --- a/backends/vulkan/test/test_vulkan_delegate.py +++ b/backends/vulkan/test/test_vulkan_delegate.py @@ -204,6 +204,16 @@ def forward(self, x, y, w): self.lower_module_and_test_output(add_module, sample_inputs) + sample_inputs = ( + torch.rand(size=(4, 5, 2, 3), dtype=torch.float32), + torch.rand(size=(4, 5, 2, 3), dtype=torch.float32), + torch.rand( + size=(2, 3), dtype=torch.float32 + ), # test broadcasting on packed dim + ) + + self.lower_module_and_test_output(add_module, sample_inputs) + def test_vulkan_backend_add_int(self): class AddIntModule(torch.nn.Module): def __init__(self): @@ -1633,6 +1643,42 @@ def forward(self, x): memory_layouts=[vk_graph_schema.VkMemoryLayout.TENSOR_CHANNELS_PACKED], ) + def test_vulkan_backend_conv_with_clamp(self): + class ConvWithClampModule(torch.nn.Module): + def __init__(self): + super().__init__() + self.weight = torch.randn(6, 8, 3, 3) + self.bias = torch.randn(8) + self.stride = (1, 2) + self.padding = (2, 3) + self.dilation = (1, 1) + self.transposed = True + self.output_padding = (0, 1) + self.groups = 1 + self.output_min = 0 + self.output_max = 10 + + def forward(self, x): + return torch.ops.et_vk.conv_with_clamp( + x, + self.weight, + self.bias, + self.stride, + self.padding, + self.dilation, + self.transposed, + self.output_padding, + self.groups, + self.output_min, + self.output_max, + ) + + self.lower_module_and_test_output( + ConvWithClampModule(), + (torch.randn(size=(1, 6, 40, 50), dtype=torch.float32),), + memory_layouts=[vk_graph_schema.VkMemoryLayout.TENSOR_CHANNELS_PACKED], + ) + def test_vulkan_backend_grid_priors(self): class GridPriorsModule(torch.nn.Module): def __init__(self): diff --git a/backends/vulkan/test/utils/test_utils.cpp b/backends/vulkan/test/utils/test_utils.cpp index 6c056cc9d90..86e9cfc5d57 100644 --- a/backends/vulkan/test/utils/test_utils.cpp +++ b/backends/vulkan/test/utils/test_utils.cpp @@ -8,13 +8,15 @@ #include -#include +#include #include #include #include +using namespace vkcompute; + // // Operator Recording Functions // @@ -68,15 +70,14 @@ void record_nchw_to_image_op( vkapi::VulkanBuffer& src_buffer, api::vTensor& v_dst) { vkapi::PipelineBarrier pipeline_barrier{}; - vkapi::SpecVarList specialization_constants = { - 
SV(v_dst.packed_dim_whcn_idx())}; + vkapi::SpecVarList specialization_constants = {SV(v_dst.packed_dim())}; context->submit_compute_job( get_nchw_to_tensor_shader( v_dst, context->adapter_ptr()->has_full_int8_buffers_support()), pipeline_barrier, - v_dst.image_extents(), - adaptive_work_group_size(v_dst.image_extents()), + v_dst.logical_limits(), + adaptive_work_group_size(v_dst.logical_limits()), specialization_constants, VK_NULL_HANDLE, 0, @@ -85,7 +86,8 @@ void record_nchw_to_image_op( vkapi::PipelineStage::COMPUTE, vkapi::MemoryAccessType::WRITE), src_buffer, - v_dst.sizes_ubo()); + v_dst.sizes_ubo(), + v_dst.axis_map_ubo()); } void record_image_to_nchw_op( @@ -93,26 +95,26 @@ void record_image_to_nchw_op( api::vTensor& v_src, vkapi::VulkanBuffer& dst_buffer) { vkapi::PipelineBarrier pipeline_barrier{}; - vkapi::SpecVarList specialization_constants = { - SV(v_src.packed_dim_whcn_idx())}; + vkapi::SpecVarList specialization_constants = {SV(v_src.packed_dim())}; context->submit_compute_job( get_tensor_to_nchw_shader(v_src), pipeline_barrier, - v_src.image_extents(), - adaptive_work_group_size(v_src.image_extents()), + v_src.logical_limits(), + adaptive_work_group_size(v_src.logical_limits()), specialization_constants, VK_NULL_HANDLE, 0, dst_buffer, v_src.image(pipeline_barrier, vkapi::PipelineStage::COMPUTE), - v_src.sizes_ubo()); + v_src.sizes_ubo(), + v_src.axis_map_ubo()); } void record_int8_image_to_nchw_noint8_op( api::Context* const context, api::vTensor& v_src, - api::StorageBuffer& dst_buffer) { + api::StagingBuffer& dst_buffer) { vkapi::PipelineBarrier pipeline_barrier{}; uint32_t buffer_len = utils::safe_downcast(dst_buffer.numel() / 4); utils::uvec3 global_wg_size = {buffer_len, 1, 1}; @@ -121,12 +123,13 @@ void record_int8_image_to_nchw_noint8_op( pipeline_barrier, global_wg_size, adaptive_work_group_size(global_wg_size), - {v_src.packed_dim_whcn_idx()}, + {v_src.packed_dim()}, VK_NULL_HANDLE, 0, dst_buffer.buffer(), v_src.image(pipeline_barrier, vkapi::PipelineStage::COMPUTE), v_src.sizes_ubo(), + v_src.axis_map_ubo(), v_src.numel_ubo()); } @@ -155,8 +158,8 @@ void record_conv2d_prepack_weights_op( context->submit_compute_job( shader, pipeline_barrier, - v_dst.image_extents(), - adaptive_work_group_size(v_dst.image_extents()), + v_dst.logical_limits(), + adaptive_work_group_size(v_dst.logical_limits()), specialization_constants, VK_NULL_HANDLE, 0, @@ -183,8 +186,8 @@ void record_binary_op( context->submit_compute_job( VK_KERNEL_FROM_STR(kernel_name), pipeline_barrier, - v_dst.image_extents(), - adaptive_work_group_size(v_dst.image_extents()), + v_dst.logical_limits(), + adaptive_work_group_size(v_dst.logical_limits()), specialization_constants, VK_NULL_HANDLE, 0, @@ -311,6 +314,42 @@ void record_reference_matmul( mat2.strides_ubo()); } +void record_matmul_texture3d( + api::Context* context, + api::vTensor& out, + api::vTensor& mat1, + api::vTensor& mat2) { + std::string kernel_name = "matmul_naive"; + kernel_name.reserve(kShaderNameReserve); + add_storage_type_suffix(kernel_name, out.storage_type()); + add_dtype_suffix(kernel_name, out.dtype()); + + utils::uvec3 global_wg_size = out.logical_limits(); + + vkapi::PipelineBarrier pipeline_barrier{}; + api::context()->submit_compute_job( + VK_KERNEL_FROM_STR(kernel_name), + pipeline_barrier, + global_wg_size, + {8, 8, 1}, + {out.packed_dim(), mat1.packed_dim(), mat2.packed_dim()}, + VK_NULL_HANDLE, + 0, + out.image( + pipeline_barrier, + vkapi::PipelineStage::COMPUTE, + vkapi::MemoryAccessType::WRITE), + 
mat1.image(pipeline_barrier, vkapi::PipelineStage::COMPUTE), + mat2.image(pipeline_barrier, vkapi::PipelineStage::COMPUTE), + out.sizes_ubo(), + out.logical_limits_ubo(), + out.axis_map_ubo(), + mat1.sizes_ubo(), + mat1.axis_map_ubo(), + mat2.sizes_ubo(), + mat2.axis_map_ubo()); +} + // // Input & Output Utilities // @@ -319,22 +358,22 @@ void record_reference_matmul( _(uint8_t, Byte) \ _(int8_t, Char) \ _(int32_t, Int) \ - _(torch::executor::Half, Half) \ + _(exec_aten::Half, Half) \ _(float, Float) \ _(int8_t, QInt8) void fill_vtensor(api::vTensor& vten, std::vector& data) { - api::StorageBuffer staging_buffer(api::context(), vten.dtype(), data.size()); - -#define CASE(ctype, name) \ - case vkapi::ScalarType::name: { \ - std::vector data_converted; \ - data_converted.resize(data.size()); \ - for (int i = 0; i < data.size(); ++i) { \ - data_converted[i] = ctype(data[i]); \ - } \ - copy_ptr_to_staging( \ - data_converted.data(), staging_buffer, vten.staging_buffer_nbytes()); \ + api::StagingBuffer staging_buffer(api::context(), vten.dtype(), data.size()); + +#define CASE(ctype, name) \ + case vkapi::ScalarType::name: { \ + std::vector data_converted; \ + data_converted.resize(data.size()); \ + for (int i = 0; i < data.size(); ++i) { \ + data_converted[i] = ctype(data[i]); \ + } \ + staging_buffer.copy_from( \ + data_converted.data(), vten.staging_buffer_nbytes()); \ } break; switch (vten.dtype()) { @@ -377,6 +416,20 @@ std::vector create_random_float_buffer( return data; } +std::vector create_random_uint8_buffer( + const size_t numel, + const uint8_t min, + const uint8_t max) { + std::vector data(numel); + std::default_random_engine rng; + std::uniform_real_distribution dist(min, max); + + for (size_t i = 0; i < data.size(); ++i) { + data[i] = (uint8_t)dist(rng); + } + return data; +} + void fill_vtensor( ComputeGraph& graph, const IOValueRef idx, @@ -397,7 +450,7 @@ void fill_vtensor( } void extract_vtensor(api::vTensor& vten, std::vector& data) { - api::StorageBuffer staging_buffer( + api::StagingBuffer staging_buffer( api::context(), vten.dtype(), vten.staging_buffer_numel()); if (vten.storage_type() == utils::StorageType::BUFFER) { @@ -410,14 +463,14 @@ void extract_vtensor(api::vTensor& vten, std::vector& data) { api::context()->submit_cmd_to_gpu(fence.get_submit_handle()); fence.wait(); -#define CASE(ctype, name) \ - case vkapi::ScalarType::name: { \ - std::vector data_converted(data.size()); \ - copy_staging_to_ptr( \ - staging_buffer, data_converted.data(), vten.staging_buffer_nbytes()); \ - for (int i = 0; i < data.size(); ++i) { \ - data[i] = float(data_converted[i]); \ - } \ +#define CASE(ctype, name) \ + case vkapi::ScalarType::name: { \ + std::vector data_converted(data.size()); \ + staging_buffer.copy_to( \ + data_converted.data(), vten.staging_buffer_nbytes()); \ + for (int i = 0; i < data.size(); ++i) { \ + data[i] = float(data_converted[i]); \ + } \ } break; switch (vten.dtype()) { @@ -440,8 +493,10 @@ void submit_to_gpu() { } vkapi::Allocation allocate_memory_for(const api::vTensor& vten) { + VmaAllocationCreateInfo alloc_create_info = + api::context()->adapter_ptr()->vma().gpuonly_resource_create_info(); return api::context()->adapter_ptr()->vma().create_allocation( - vten.get_memory_requirements(), vten.get_allocation_create_info()); + vten.get_memory_requirements(), alloc_create_info); } VmaTotalStatistics get_vma_stats() { diff --git a/backends/vulkan/test/utils/test_utils.h b/backends/vulkan/test/utils/test_utils.h index bf549446170..d9d83a9620f 100644 --- 
a/backends/vulkan/test/utils/test_utils.h +++ b/backends/vulkan/test/utils/test_utils.h @@ -16,11 +16,9 @@ #include #include -using namespace vkcompute; - #define CREATE_FLOAT_TEXTURE(sizes, allocate_memory) \ - api::vTensor( \ - api::context(), \ + vkcompute::api::vTensor( \ + vkcompute::api::context(), \ sizes, \ vkapi::kFloat, \ utils::StorageType::TEXTURE_3D, \ @@ -28,25 +26,29 @@ using namespace vkcompute; allocate_memory); #define CREATE_FLOAT_BUFFER(sizes, allocate_memory) \ - api::vTensor( \ - api::context(), \ + vkcompute::api::vTensor( \ + vkcompute::api::context(), \ sizes, \ vkapi::kFloat, \ utils::StorageType::BUFFER, \ utils::GPUMemoryLayout::TENSOR_WIDTH_PACKED, \ allocate_memory); -#define DEFINE_STAGING_BUFFER_AND_RECORD_TO_GPU_FOR(tensor) \ - api::StorageBuffer staging_buffer_##tensor( \ - api::context(), vkapi::kFloat, tensor.staging_buffer_numel()); \ - record_nchw_to_image_op( \ - api::context(), staging_buffer_##tensor.buffer(), tensor); - -#define DEFINE_STAGING_BUFFER_AND_RECORD_FROM_GPU_FOR(tensor) \ - api::StorageBuffer staging_buffer_##tensor( \ - api::context(), vkapi::kFloat, tensor.staging_buffer_numel()); \ - record_image_to_nchw_op( \ - api::context(), tensor, staging_buffer_##tensor.buffer()); +#define DEFINE_STAGING_BUFFER_AND_RECORD_TO_GPU_FOR(tensor) \ + vkcompute::api::StagingBuffer staging_buffer_##tensor( \ + vkcompute::api::context(), \ + vkapi::kFloat, \ + tensor.staging_buffer_numel()); \ + record_nchw_to_image_op( \ + vkcompute::api::context(), staging_buffer_##tensor.buffer(), tensor); + +#define DEFINE_STAGING_BUFFER_AND_RECORD_FROM_GPU_FOR(tensor) \ + vkcompute::api::StagingBuffer staging_buffer_##tensor( \ + vkcompute::api::context(), \ + vkapi::kFloat, \ + tensor.staging_buffer_numel()); \ + record_image_to_nchw_op( \ + vkcompute::api::context(), tensor, staging_buffer_##tensor.buffer()); #define CHECK_VALUE(data, idx, expected) \ do { \ @@ -63,108 +65,125 @@ using namespace vkcompute; // void record_nchw_to_buffer_op( - api::Context* const context, - vkapi::VulkanBuffer& src_buffer, - api::vTensor& v_dst); + vkcompute::api::Context* const context, + vkcompute::vkapi::VulkanBuffer& src_buffer, + vkcompute::api::vTensor& v_dst); void record_buffer_to_nchw_op( - api::Context* const context, - api::vTensor& v_src, - vkapi::VulkanBuffer& dst_buffer); + vkcompute::api::Context* const context, + vkcompute::api::vTensor& v_src, + vkcompute::vkapi::VulkanBuffer& dst_buffer); void record_nchw_to_image_op( - api::Context* const context, - vkapi::VulkanBuffer& src_buffer, - api::vTensor& v_dst); + vkcompute::api::Context* const context, + vkcompute::vkapi::VulkanBuffer& src_buffer, + vkcompute::api::vTensor& v_dst); void record_image_to_nchw_op( - api::Context* const context, - api::vTensor& v_src, - vkapi::VulkanBuffer& dst_buffer); + vkcompute::api::Context* const context, + vkcompute::api::vTensor& v_src, + vkcompute::vkapi::VulkanBuffer& dst_buffer); void record_int8_image_to_nchw_noint8_op( - api::Context* const context, - api::vTensor& v_src, - api::StorageBuffer& dst_buffer); + vkcompute::api::Context* const context, + vkcompute::api::vTensor& v_src, + vkcompute::api::StagingBuffer& dst_buffer); void record_conv2d_prepack_weights_op( - api::Context* const context, - vkapi::VulkanBuffer& src_buffer, - api::vTensor& v_dst, + vkcompute::api::Context* const context, + vkcompute::vkapi::VulkanBuffer& src_buffer, + vkcompute::api::vTensor& v_dst, const std::vector& original_sizes, const bool transposed); void record_binary_op( - api::Context* const 
context, + vkcompute::api::Context* const context, const std::string& op_name, - api::vTensor& v_in1, - api::vTensor& v_in2, - api::vTensor& v_dst); + vkcompute::api::vTensor& v_in1, + vkcompute::api::vTensor& v_in2, + vkcompute::api::vTensor& v_dst); void execute_and_check_add( - api::vTensor& a, - api::vTensor& b, - api::vTensor& c, + vkcompute::api::vTensor& a, + vkcompute::api::vTensor& b, + vkcompute::api::vTensor& c, float a_val, float b_val); -void record_index_fill_buffer(api::Context* const context, api::vTensor& v_ten); +void record_index_fill_buffer( + vkcompute::api::Context* const context, + vkcompute::api::vTensor& v_ten); void record_scalar_add_buffer( - api::Context* context, - api::vTensor& v_ten, + vkcompute::api::Context* context, + vkcompute::api::vTensor& v_ten, float offset); void record_reference_matmul( - api::Context* context, - api::vTensor& out, - api::vTensor& mat1, - api::vTensor& mat2); + vkcompute::api::Context* context, + vkcompute::api::vTensor& out, + vkcompute::api::vTensor& mat1, + vkcompute::api::vTensor& mat2); + +void record_matmul_texture3d( + vkcompute::api::Context* context, + vkcompute::api::vTensor& out, + vkcompute::api::vTensor& mat1, + vkcompute::api::vTensor& mat2); // // Input & Output Utilities // -inline void -fill_staging(api::StorageBuffer& staging, float val, int numel = -1) { +inline void fill_staging( + vkcompute::api::StagingBuffer& staging, + float val, + int numel = -1) { if (numel < 0) { numel = staging.numel(); } std::vector data(numel); std::fill(data.begin(), data.end(), val); - copy_ptr_to_staging(data.data(), staging, sizeof(float) * numel); + staging.copy_from(data.data(), sizeof(float) * numel); } -void fill_vtensor(api::vTensor& vten, std::vector& data); +void fill_vtensor(vkcompute::api::vTensor& vten, std::vector& data); -void fill_vtensor(api::vTensor& vten, float val, bool iota = false); +void fill_vtensor(vkcompute::api::vTensor& vten, float val, bool iota = false); std::vector create_random_float_buffer( const size_t numel, const float min = 0, const float max = 1); +std::vector create_random_uint8_buffer( + const size_t numel, + const uint8_t min = 0, + const uint8_t max = 255); + void fill_vtensor( - ComputeGraph& graph, - const IOValueRef idx, + vkcompute::ComputeGraph& graph, + const vkcompute::IOValueRef idx, float val, bool iota = false); -void extract_vtensor(api::vTensor& vten, std::vector& data); +void extract_vtensor(vkcompute::api::vTensor& vten, std::vector& data); -inline std::vector extract_vtensor(api::vTensor& vten) { +inline std::vector extract_vtensor(vkcompute::api::vTensor& vten) { std::vector data_out(vten.staging_buffer_numel()); extract_vtensor(vten, data_out); return data_out; } -inline void -check_staging_buffer(api::StorageBuffer& staging, float val, int numel = -1) { +inline void check_staging_buffer( + vkcompute::api::StagingBuffer& staging, + float val, + int numel = -1) { if (numel < 0) { numel = staging.numel(); } std::vector data(numel); - copy_staging_to_ptr(staging, data.data(), sizeof(float) * numel); + staging.copy_to(data.data(), sizeof(float) * numel); for (size_t i = 0; i < data.size(); ++i) { CHECK_VALUE(data, i, val); @@ -172,21 +191,21 @@ check_staging_buffer(api::StorageBuffer& staging, float val, int numel = -1) { } inline int64_t get_buf_idx( - ComputeGraph& graph, - IOValueRef ref, + vkcompute::ComputeGraph& graph, + vkcompute::IOValueRef ref, const std::vector& tensor_coor) { - vTensorPtr vten_ptr = graph.get_tensor(ref.value); + vkcompute::vTensorPtr vten_ptr = 
graph.get_tensor(ref.value); const std::vector& sizes = vten_ptr->sizes(); - int64_t c = dim_at(sizes); - int64_t h = dim_at(sizes); - int64_t w = dim_at(sizes); + int64_t c = vkcompute::dim_at(sizes); + int64_t h = vkcompute::dim_at(sizes); + int64_t w = vkcompute::dim_at(sizes); - int64_t ni = dim_at(tensor_coor); - int64_t ci = dim_at(tensor_coor); - int64_t hi = dim_at(tensor_coor); - int64_t wi = dim_at(tensor_coor); + int64_t ni = vkcompute::dim_at(tensor_coor); + int64_t ci = vkcompute::dim_at(tensor_coor); + int64_t hi = vkcompute::dim_at(tensor_coor); + int64_t wi = vkcompute::dim_at(tensor_coor); return (ni * c * h * w + ci * h * w + hi * w + wi); } @@ -197,7 +216,8 @@ inline int64_t get_buf_idx( void submit_to_gpu(); -vkapi::Allocation allocate_memory_for(const api::vTensor& vten); +vkcompute::vkapi::Allocation allocate_memory_for( + const vkcompute::api::vTensor& vten); VmaTotalStatistics get_vma_stats(); @@ -208,7 +228,7 @@ size_t get_vma_allocation_count(); // void execute_graph_and_check_output( - ComputeGraph& graph, + vkcompute::ComputeGraph& graph, std::vector input_vals, std::vector expected_outputs); diff --git a/backends/vulkan/test/vulkan_compute_api_test.cpp b/backends/vulkan/test/vulkan_compute_api_test.cpp index 157f995ab4c..9a99b11f758 100644 --- a/backends/vulkan/test/vulkan_compute_api_test.cpp +++ b/backends/vulkan/test/vulkan_compute_api_test.cpp @@ -11,7 +11,7 @@ #include #include -#include +#include #include @@ -21,8 +21,11 @@ #include +#include + #include +using namespace vkcompute; using namespace vkcompute::api; std::vector @@ -177,57 +180,32 @@ TEST_F(VulkanComputeAPITest, empty_init_shader_info_test) { TEST_F(VulkanComputeAPITest, calculate_dim_order_test) { // ndim, GPUMemoryLayout, expected dim order pairs - std::vector>> - test_cases = { - {1, utils::kWidthPacked, {0}}, - {1, utils::kHeightPacked, {0}}, - {1, utils::kChannelsPacked, {0}}, - {2, utils::kWidthPacked, {0, 1}}, - {2, utils::kHeightPacked, {1, 0}}, - {2, utils::kChannelsPacked, {0, 1}}, - {3, utils::kWidthPacked, {0, 1, 2}}, - {3, utils::kHeightPacked, {0, 2, 1}}, - {3, utils::kChannelsPacked, {1, 2, 0}}, - {4, utils::kWidthPacked, {0, 1, 2, 3}}, - {4, utils::kHeightPacked, {0, 1, 3, 2}}, - {4, utils::kChannelsPacked, {0, 2, 3, 1}}, - }; + std::vector>> test_cases = { + {1, WHCN::kWidthDim, {0}}, + {1, WHCN::kHeightDim, {0}}, + {1, WHCN::kChannelsDim, {0}}, + {2, WHCN::kWidthDim, {0, 1}}, + {2, WHCN::kHeightDim, {1, 0}}, + {2, WHCN::kChannelsDim, {0, 1}}, + {3, WHCN::kWidthDim, {0, 1, 2}}, + {3, WHCN::kHeightDim, {0, 2, 1}}, + {3, WHCN::kChannelsDim, {1, 2, 0}}, + {4, WHCN::kWidthDim, {0, 1, 2, 3}}, + {4, WHCN::kHeightDim, {0, 1, 3, 2}}, + {4, WHCN::kChannelsDim, {0, 2, 3, 1}}, + }; for (const auto& test_case : test_cases) { const size_t& ndim = std::get<0>(test_case); - const utils::GPUMemoryLayout& layout = std::get<1>(test_case); + const int32_t packed_dim = std::get<1>(test_case); const auto& expected_dim_order = std::get<2>(test_case); - std::vector dim_order = calculate_dim_order(ndim, layout); - - ASSERT_TRUE(dim_order == expected_dim_order); - } -} - -TEST_F(VulkanComputeAPITest, calculate_tensor_dim_order_test) { - // Stride, expected dim order pairs. Note that strides don't have to "make - // sense" because only they are sorted; the actual stride values don't matter. 
- std::vector, std::vector>> - test_cases = { - {{8, 1}, {0, 1}}, - {{2, 10}, {1, 0}}, - {{66, 12, 1}, {0, 1, 2}}, - {{32, 128, 4}, {1, 0, 2}}, - {{3, 8, 11, 212}, {3, 2, 1, 0}}, - {{100, 12, 9, 1}, {0, 1, 2, 3}}, - {{10, 12, 101, 6}, {2, 1, 0, 3}}, - }; - - for (const auto& test_case : test_cases) { - const auto& strides = std::get<0>(test_case); - const auto& expected_dim_order = std::get<1>(test_case); - std::vector dim_order = strides_to_dim_order(strides); + std::vector dim_order = calculate_dim_order(ndim, packed_dim); ASSERT_TRUE(dim_order == expected_dim_order); } } TEST_F(VulkanComputeAPITest, calculate_tensor_strides_test) { - // vtensor to be resized vTensor v_tensor_to_resize( context(), {25, 25, 25, 25}, @@ -243,8 +221,9 @@ TEST_F(VulkanComputeAPITest, calculate_tensor_strides_test) { for (const auto& layout : {utils::kWidthPacked, utils::kHeightPacked, utils::kChannelsPacked}) { { + const int32_t packed_dim = static_cast(layout); std::vector dim_order = - calculate_dim_order(sizes.size(), layout); + calculate_dim_order(sizes.size(), packed_dim); std::vector strides = calculate_strides(sizes, dim_order); std::vector ref_strides = get_reference_strides(sizes, layout); ASSERT_TRUE(strides == ref_strides); @@ -280,26 +259,112 @@ TEST_F(VulkanComputeAPITest, calculate_tensor_strides_test) { } } +TEST_F(VulkanComputeAPITest, virtual_transpose_test) { + std::vector sizes = {7, 9, 11, 13}; + // (dim0, dim1), new_sizes, new_dim_order, new_axis_map, new_packed_dim_idx + std::vector>> test_cases = { + {{2, 3}, {7, 9, 13, 11}, {0, 1, 3, 2}, {1, 0, 2, 2}, {1}}, + {{2, 1}, {7, 11, 9, 13}, {0, 2, 1, 3}, {0, 2, 1, 1}, {0}}, + {{1, 3}, {7, 13, 11, 9}, {0, 3, 2, 1}, {2, 1, 0, 0}, {2}}, + }; + + for (const auto& test_case : test_cases) { + const int dim0 = test_case.at(0).at(0); + const int dim1 = test_case.at(0).at(1); + + const auto& expected_sizes = test_case.at(1); + const auto& expected_dim_order = test_case.at(2); + const auto& expected_axis_map = test_case.at(3); + const int expected_packed_dim = test_case.at(4).at(0); + + { + vTensor a_buffer = vTensor( + context(), sizes, vkapi::kFloat, utils::kBuffer, utils::kWidthPacked); + + a_buffer.virtual_transpose(dim0, dim1); + EXPECT_TRUE(a_buffer.sizes() == expected_sizes); + EXPECT_TRUE(a_buffer.dim_order() == expected_dim_order); + } + + { + vTensor a_texture = vTensor( + context(), + sizes, + vkapi::kFloat, + utils::kTexture3D, + utils::kWidthPacked); + a_texture.virtual_transpose(dim0, dim1); + EXPECT_TRUE(a_texture.sizes() == expected_sizes); + EXPECT_TRUE(a_texture.axis_map() == expected_axis_map); + EXPECT_TRUE(a_texture.packed_dim() == expected_packed_dim); + } + } +} + +utils::ivec3 make_temp_ivec3(int x, int y, int z) { + return utils::ivec3{x, y, z}; +} + TEST_F(VulkanComputeAPITest, vec_test) { - utils::vec3 v3({1, 2, 3}); - ASSERT_TRUE(v3[0] == 1); - ASSERT_TRUE(v3[1] == 2); - ASSERT_TRUE(v3[2] == 3); - v3 = {4, 5, 6}; - ASSERT_TRUE(v3[0] == 4); - ASSERT_TRUE(v3[1] == 5); - ASSERT_TRUE(v3[2] == 6); - - utils::uvec4 uv4({4, 3, 2, 1}); - ASSERT_TRUE(uv4[0] == 4); - ASSERT_TRUE(uv4[1] == 3); - ASSERT_TRUE(uv4[2] == 2); - ASSERT_TRUE(uv4[3] == 1); - uv4 = {11, 13, 12, 88}; - ASSERT_TRUE(uv4[0] == 11); - ASSERT_TRUE(uv4[1] == 13); - ASSERT_TRUE(uv4[2] == 12); - ASSERT_TRUE(uv4[3] == 88); + { + utils::vec3 v3({1, 2, 3}); + ASSERT_TRUE(v3[0] == 1); + ASSERT_TRUE(v3[1] == 2); + ASSERT_TRUE(v3[2] == 3); + v3 = {4, 5, 6}; + ASSERT_TRUE(v3[0] == 4); + ASSERT_TRUE(v3[1] == 5); + ASSERT_TRUE(v3[2] == 6); + } + + { + utils::uvec4 uv4({4, 3, 2, 
1}); + ASSERT_TRUE(uv4[0] == 4); + ASSERT_TRUE(uv4[1] == 3); + ASSERT_TRUE(uv4[2] == 2); + ASSERT_TRUE(uv4[3] == 1); + uv4 = {11, 13, 12, 88}; + ASSERT_TRUE(uv4[0] == 11); + ASSERT_TRUE(uv4[1] == 13); + ASSERT_TRUE(uv4[2] == 12); + ASSERT_TRUE(uv4[3] == 88); + } + + // Test copy from same type + { + utils::ivec3 v{5, 6, 8}; + utils::ivec3 v2 = v; + + ASSERT_TRUE(v2[0] == 5); + ASSERT_TRUE(v2[1] == 6); + ASSERT_TRUE(v2[2] == 8); + } + + // Test copy from different type + { + utils::uvec3 v{5, 6, 8}; + utils::ivec3 v2 = v; + + ASSERT_TRUE(v2[0] == 5); + ASSERT_TRUE(v2[1] == 6); + ASSERT_TRUE(v2[2] == 8); + } + + // Test construction from temporary vec + { + utils::uvec3 v{make_temp_ivec3(4, 5, 10)}; + ASSERT_TRUE(v[0] == 4); + ASSERT_TRUE(v[1] == 5); + ASSERT_TRUE(v[2] == 10); + } + + // Test initalization from temporary vec + { + utils::uvec3 v = make_temp_ivec3(4, 5, 10); + ASSERT_TRUE(v[0] == 4); + ASSERT_TRUE(v[1] == 5); + ASSERT_TRUE(v[2] == 10); + } } TEST_F(VulkanComputeAPITest, retrieve_custom_shader_test) { @@ -358,7 +423,7 @@ TEST_F(VulkanComputeAPITest, spec_var_classes_test) { TEST_F(VulkanComputeAPITest, spec_var_shader_test) { size_t len = 16; - StorageBuffer buffer(context(), vkapi::kFloat, len); + StagingBuffer buffer(context(), vkapi::kFloat, len); float scale = 3.0f; float offset = 1.5f; @@ -382,7 +447,7 @@ TEST_F(VulkanComputeAPITest, spec_var_shader_test) { submit_to_gpu(); std::vector data(len); - copy_staging_to_ptr(buffer, data.data(), buffer.nbytes()); + buffer.copy_to(data.data(), buffer.nbytes()); for (size_t i = 0; i < len; ++i) { CHECK_VALUE(data, i, scale * i + offset); @@ -429,7 +494,7 @@ TEST_F(VulkanComputeAPITest, update_params_between_submit) { params.buffer()); } - StorageBuffer staging_buffer( + StagingBuffer staging_buffer( context(), vkapi::kFloat, a.staging_buffer_numel()); record_image_to_nchw_op(context(), a, staging_buffer.buffer()); @@ -450,7 +515,7 @@ TEST_F(VulkanComputeAPITest, update_params_between_submit) { template void test_storage_buffer_type(const size_t len) { - StorageBuffer buffer(context(), dtype, len); + StagingBuffer buffer(context(), dtype, len); std::string kernel_name("idx_fill_buffer"); switch (dtype) { @@ -492,7 +557,7 @@ void test_storage_buffer_type(const size_t len) { submit_to_gpu(); std::vector data(len); - copy_staging_to_ptr(buffer, data.data(), buffer.nbytes()); + buffer.copy_to(data.data(), buffer.nbytes()); for (size_t i = 0; i < len; ++i) { CHECK_VALUE(data, i, T(i)); @@ -507,7 +572,7 @@ TEST_F(VulkanComputeAPITest, test_buffer_float16) { if (!context()->adapter_ptr()->has_full_float16_buffers_support()) { GTEST_SKIP(); } - test_storage_buffer_type(16); + test_storage_buffer_type(16); } TEST_F(VulkanComputeAPITest, test_buffer_int8) { @@ -589,7 +654,7 @@ TEST_F(VulkanComputeAPITest, buffer_tensor_sanity_check) { run_buffer_tensor_sanity_check(a); break; case vkapi::kHalf: - run_buffer_tensor_sanity_check(a); + run_buffer_tensor_sanity_check(a); break; case vkapi::kChar: run_buffer_tensor_sanity_check(a); @@ -626,26 +691,30 @@ TEST_F(VulkanComputeAPITest, texture_add_sanity_check) { } } -TEST_F(VulkanComputeAPITest, tensor_copy_test) { - std::vector sizes = {9, 9}; - std::vector strides = - get_reference_strides(sizes, utils::kWidthPacked); - std::vector dim_order = {0, 1}; +TEST_F(VulkanComputeAPITest, tensor_alias_test) { + for (utils::StorageType storage_type : {utils::kTexture3D, utils::kBuffer}) { + std::vector sizes = {9, 9}; - vTensor original = CREATE_FLOAT_BUFFER(sizes, /*allocate_memory=*/true); - vTensor copy 
= vTensor(original, sizes, dim_order); - EXPECT_TRUE(get_vma_allocation_count() == 1); - EXPECT_TRUE(copy.is_view_of(original)); + const size_t alloc_count_before = get_vma_allocation_count(); - // Fill original tensor with some data - fill_vtensor(original, 2.5f, true); + vTensor original = vTensor(context(), sizes, vkapi::kFloat, storage_type); - std::vector data_out(copy.staging_buffer_numel()); - // Extract the copy tensor; should contain the data of the original tensor - extract_vtensor(copy, data_out); + vTensor copy = vTensor(original); - for (size_t i = 0; i < data_out.size(); ++i) { - CHECK_VALUE(data_out, i, 2.5f + i); + // Two tensors but only one additional allocation. + EXPECT_TRUE(get_vma_allocation_count() == alloc_count_before + 1); + EXPECT_TRUE(copy.is_view_of(original)); + + // Fill original tensor with some data + fill_vtensor(original, 2.5f, true); + + std::vector data_out(copy.staging_buffer_numel()); + // Extract the copy tensor; should contain the data of the original tensor + extract_vtensor(copy, data_out); + + for (size_t i = 0; i < original.numel(); ++i) { + CHECK_VALUE(data_out, i, 2.5f + i); + } } } @@ -655,46 +724,58 @@ TEST_F(VulkanComputeAPITest, tensor_no_copy_transpose_test) { constexpr int N = 17; std::vector mat1_sizes = {M, K}; std::vector mat2_sizes = {N, K}; - std::vector mat2_t_sizes = {K, N}; std::vector out_sizes = {M, N}; - std::vector transposed_dim_order = {1, 0}; - - vTensor mat1 = CREATE_FLOAT_BUFFER(mat1_sizes, /*allocate_memory=*/true); - vTensor mat2 = CREATE_FLOAT_BUFFER(mat2_sizes, /*allocate_memory=*/true); - vTensor out = CREATE_FLOAT_BUFFER(out_sizes, /*allocate_memory=*/true); - - // Generate data - std::vector mat1_data = - create_random_float_buffer(mat1.staging_buffer_numel()); - std::vector mat2_data = - create_random_float_buffer(mat2.staging_buffer_numel()); - - // Create direct view and modify sizes and strides later - vTensor mat2_t = vTensor(mat2); - - std::vector mat2_t_data = transpose_matrix(mat2_data, N, K); - std::vector ref_out = - compute_reference_matmul(mat1_data, mat2_t_data, M, K, N); - - // Fill original tensor with some data - fill_vtensor(mat1, mat1_data); - fill_vtensor(mat2, mat2_data); - - record_reference_matmul(api::context(), out, mat1, mat2_t); - - // Update sizes and strides of mat2_t to be that of a transposed tensor - mat2_t.virtual_reconfigure(mat2_t_sizes, transposed_dim_order); - EXPECT_TRUE(mat2_t.gpu_memory_layout() == utils::kHeightPacked); - - std::vector data_out(out.staging_buffer_numel()); - // Extract the copy tensor; should contain the data of the original tensor - extract_vtensor(out, data_out); + for (const auto storage_type : {utils::kTexture3D, utils::kBuffer}) { + vTensor mat1 = vTensor( + context(), + mat1_sizes, + vkapi::kFloat, + storage_type, + utils::kWidthPacked); + vTensor mat2 = vTensor( + context(), + mat2_sizes, + vkapi::kFloat, + storage_type, + utils::kWidthPacked); + vTensor out = vTensor( + context(), out_sizes, vkapi::kFloat, storage_type, utils::kWidthPacked); + + // Generate data + std::vector mat1_data = + create_random_float_buffer(mat1.staging_buffer_numel()); + std::vector mat2_data = + create_random_float_buffer(mat2.staging_buffer_numel()); + + // Create direct view and modify sizes and strides later + vTensor mat2_t = vTensor(mat2); + // Update sizes and strides of mat2_t to be that of a transposed tensor + mat2_t.virtual_transpose(0, 1); + + EXPECT_TRUE(mat2_t.packed_dim() == WHCN::kHeightDim); + + std::vector mat2_t_data = transpose_matrix(mat2_data, N, K); + 
std::vector ref_out = + compute_reference_matmul(mat1_data, mat2_t_data, M, K, N); + + // Fill original tensor with some data + fill_vtensor(mat1, mat1_data); + fill_vtensor(mat2, mat2_data); + + if (storage_type == utils::kTexture3D) { + record_matmul_texture3d(context(), out, mat1, mat2_t); + } else { + record_reference_matmul(context(), out, mat1, mat2_t); + } - EXPECT_TRUE(data_out.size() == ref_out.size()); + std::vector data_out(out.staging_buffer_numel()); + // Extract the copy tensor; should contain the data of the original tensor + extract_vtensor(out, data_out); - for (size_t i = 0; i < data_out.size(); ++i) { - EXPECT_TRUE(check_close(data_out[i], ref_out[i])); + for (size_t i = 0; i < ref_out.size(); ++i) { + EXPECT_TRUE(check_close(data_out[i], ref_out[i])); + } } } @@ -904,64 +985,6 @@ TEST_F(VulkanComputeAPITest, use_non_bound_textures_fails) { EXPECT_THROW(fill_vtensor(a, data_a), vkapi::Error); } -TEST_F(VulkanComputeAPITest, tensor_reallocation_test) { - std::vector sizes = {4, 4, 1}; - vTensor a = CREATE_FLOAT_TEXTURE(sizes, /*allocate_memory = */ true); - vTensor b = CREATE_FLOAT_TEXTURE(sizes, /*allocate_memory = */ true); - vTensor c = CREATE_FLOAT_TEXTURE(sizes, /*allocate_memory = */ true); - - execute_and_check_add(a, b, c, 3.0f, 5.0f); - - // Redo with new sizes - std::vector new_sizes = {4, 6, 3}; - a.reallocate(new_sizes); - b.reallocate(new_sizes); - c.reallocate(new_sizes); - - // Flush everything - context()->flush(); - - execute_and_check_add(a, b, c, 12.0f, 10.0f); -} - -TEST_F( - VulkanComputeAPITest, - tensor_reallocation_with_deferred_allocation_test) { - std::vector sizes = {8, 8, 8}; - vTensor a = CREATE_FLOAT_TEXTURE(sizes, /*allocate_memory = */ false); - vTensor b = CREATE_FLOAT_TEXTURE(sizes, /*allocate_memory = */ false); - vTensor c = CREATE_FLOAT_TEXTURE(sizes, /*allocate_memory = */ false); - - vkapi::Allocation a_mem = allocate_memory_for(a); - a.image().bind_allocation(a_mem); - vkapi::Allocation b_mem = allocate_memory_for(b); - b.image().bind_allocation(b_mem); - vkapi::Allocation c_mem = allocate_memory_for(c); - c.image().bind_allocation(c_mem); - - execute_and_check_add(a, b, c, 4.0f, 8.0f); - - std::vector> new_sizes_list = { - {4, 3, 5}, {4, 1, 7}, {8, 3, 2}, {8, 7, 2}}; - - for (auto& new_sizes : new_sizes_list) { - // Redo with new sizes - a.reallocate(new_sizes); - b.reallocate(new_sizes); - c.reallocate(new_sizes); - - // Flush everything - context()->flush(); - - a.image().bind_allocation(a_mem); - b.image().bind_allocation(b_mem); - c.image().bind_allocation(c_mem); - - execute_and_check_add( - a, b, c, float(new_sizes[1] + 4.5f), float(new_sizes[2] + 13.0f)); - } -} - TEST_F(VulkanComputeAPITest, texture_virtual_resize) { context()->set_cmd(/*reusable = */ true); std::vector sizes = {8, 12, 12}; @@ -1014,6 +1037,34 @@ TEST_F(VulkanComputeAPITest, texture_virtual_resize) { graph.get_tensor(name.value)->staging_buffer_numel()); \ graph.copy_from_staging(name.staging, data_##name.data(), data_##name.size()); +// The purpose of this test is simply to track the size of various classes over +// time, in the interest of making sure that they don't grow too large. +TEST_F(VulkanComputeAPITest, print_object_sizes) { +#define PRINT_SIZE(name) \ std::cout << #name << " size: " << sizeof(name) << " B" << std::endl + PRINT_SIZE(vTensor); + PRINT_SIZE(Value); + PRINT_SIZE(StagingBuffer); + PRINT_SIZE(ComputeGraph); + PRINT_SIZE(ExecuteNode); +#undef PRINT_SIZE + + // The actual sizes of each object are dependent on the platform. 
However, we + // can alert ourselves to any significant changes in the sizes of these + // objects by checking the `sizeof()` of the class against some loose thresholds. + + // Current known size on 64 bit system: 1040 B + EXPECT_TRUE(sizeof(vTensor) < 1200); + // Current known size on 64 bit system: 1056 B + EXPECT_TRUE(sizeof(Value) < 1200); + // Current known size on 64 bit system: 120 B + EXPECT_TRUE(sizeof(StagingBuffer) < 500); + // Current known size on 64 bit system: 384 B + EXPECT_TRUE(sizeof(ComputeGraph) < 500); + // Current known size on 64 bit system: 248 B + EXPECT_TRUE(sizeof(ExecuteNode) < 500); +} + TEST(VulkanComputeGraphTest, test_values_scalars) { GraphConfig config; ComputeGraph graph(config); @@ -1227,8 +1278,8 @@ TEST(VulkanComputeGraphTest, test_simple_graph) { GraphConfig config; ComputeGraph graph(config); - std::vector size_big = {8, 64, 124}; - std::vector size_small = {8, 1, 124}; + std::vector size_big = {1, 8, 8}; + std::vector size_small = {1, 1, 8}; // Build graph @@ -1268,6 +1319,64 @@ TEST(VulkanComputeGraphTest, test_simple_graph) { } } +TEST(VulkanComputeGraphTest, test_simple_graph_with_symint) { + GraphConfig config; + config.set_storage_type_override(utils::kTexture3D); + ComputeGraph graph(config); + + std::vector sizes = {8, 64, 124}; + + // Build graph + + ValueRef scalar = graph.add_symint(1); + IOValueRef a = graph.add_input_tensor(sizes, vkapi::kFloat); + + IOValueRef out = {}; + out.value = a.value; + + graph.execute_nodes().emplace_back(new ExecuteNode( + graph, + VK_KERNEL_FROM_STR("scalar_add_texture"), + graph.create_global_wg_size(a.value), + graph.create_local_wg_size(a.value), + // Inputs and Outputs + {{out.value, vkapi::MemoryAccessType::WRITE}}, + // Shader params buffers + {graph.logical_limits_ubo(a.value), + graph.get_or_create_int_param_buffer(scalar)}, + // Specialization Constants + {}, + // Resizing Logic + nullptr, + {})); + + out.staging = graph.set_output_tensor(out.value); + + graph.prepare(); + graph.encode_execute(); + + // Run graph + + for (float i = 5.0f; i < 30.0f; i += 10.0f) { + int scalar_val = i - 3.0f; + graph.set_symint(scalar, scalar_val); + + float val_a = i + 2.0f; + float val_out = val_a + scalar_val; + + fill_vtensor(graph, a, val_a); + + graph.execute(); + + EXTRACT_TENSOR(out); + + // Sanity check that the values are correct + for (size_t i = 0; i < graph.get_tensor(out.value)->numel(); ++i) { + CHECK_VALUE(data_out, i, val_out); + } + } +} + #define CREATE_WEIGHT_TENSOR(name, sizes, dtype, val) \ std::vector data_##name(utils::multiply_integers(sizes)); \ std::fill(data_##name.begin(), data_##name.end(), val); \ @@ -1335,6 +1444,7 @@ TEST(VulkanComputeGraphTest, test_simple_prepacked_graph) { TEST(VulkanComputeGraphTest, test_simple_shared_objects_with_resize) { GraphConfig config; ComputeGraph graph(config); + size_t expected_vma_allocation_count = 0; std::vector size_big = {12, 64, 64}; std::vector size_small = {12, 64, 64}; @@ -1351,8 +1461,10 @@ TEST(VulkanComputeGraphTest, test_simple_shared_objects_with_resize) { /*shared_object_idx = */ 4); // +2: t.sizes_ubo() for each staging shader + // +2: t.axis_map_ubo() for each staging shader // +2: staging buffer for each input tensor - EXPECT_TRUE(get_vma_allocation_count() == 4); + expected_vma_allocation_count += 6; + EXPECT_EQ(get_vma_allocation_count(), expected_vma_allocation_count); ValueRef c = graph.add_tensor( size_big, @@ -1362,15 +1474,22 @@ TEST(VulkanComputeGraphTest, test_simple_shared_objects_with_resize) { auto addFn = 
VK_GET_OP_FN("aten.add.Tensor"); addFn(graph, {a.value, b.value, kDummyValueRef, c}); + // +2: alpha UBO, broadcast UBO for arithmetic shader + // +1: t.sizes_ubo() for arithmetic shader output c + // +1: t.axis_map_ubo() for arithmetic shader output c + expected_vma_allocation_count += 4; + EXPECT_EQ(get_vma_allocation_count(), expected_vma_allocation_count); + IOValueRef d = graph.add_input_tensor( size_small, vkapi::kFloat, /*shared_object_idx = */ 2); - // +2: alpha UBO, broadcast UBO for arithmetic shader // +1: t.sizes_ubo() uniform buffer for staging shader + // +1: t.axis_map_ubo() uniform buffer for staging shader // +1: staging buffer for the input tensor - EXPECT_TRUE(get_vma_allocation_count() == 9); + expected_vma_allocation_count += 3; + EXPECT_EQ(get_vma_allocation_count(), expected_vma_allocation_count); ValueRef e = graph.add_tensor( size_big, @@ -1380,20 +1499,26 @@ TEST(VulkanComputeGraphTest, test_simple_shared_objects_with_resize) { auto mulFn = VK_GET_OP_FN("aten.mul.Tensor"); mulFn(graph, {c, d.value, e}); + // +2: alpha UBO, broadcast UBO for arithmetic shader + // +1: t.sizes_ubo() for arithmetic shader output e + // +1: t.axis_map_ubo() for arithmetic shader output e + expected_vma_allocation_count += 4; + EXPECT_EQ(get_vma_allocation_count(), expected_vma_allocation_count); + IOValueRef out = {}; out.value = e; out.staging = graph.set_output_tensor(out.value); - // +2: alpha UBO, broadcast UBO for arithmetic shader - // +1: t.sizes_ubo() for staging shader - // +1 staging buffer for the input tensor - EXPECT_TRUE(get_vma_allocation_count() == 13); + // +1: staging buffer for the output tensor + expected_vma_allocation_count += 1; + EXPECT_EQ(get_vma_allocation_count(), expected_vma_allocation_count); graph.prepare(); graph.encode_execute(); // +3: shared memory allocations for tensors - EXPECT_TRUE(get_vma_allocation_count() == 16); + expected_vma_allocation_count += 3; + EXPECT_EQ(get_vma_allocation_count(), expected_vma_allocation_count); // Run graph @@ -1460,6 +1585,105 @@ TEST(VulkanComputeGraphTest, test_simple_shared_objects_with_resize) { } } +TEST(VulkanComputeGraphTest, test_simple_graph_with_tmp_tensors) { + GraphConfig config; + ComputeGraph graph(config); + + std::vector size_big = {8, 64, 124}; + std::vector size_small = {8, 1, 124}; + + // Build graph + + IOValueRef a = graph.add_input_tensor( + size_big, vkapi::kFloat, /*shared_object_idx = */ 0); + IOValueRef b = graph.add_input_tensor( + size_small, vkapi::kFloat, /*shared_object_idx = */ 1); + + IOValueRef out = {}; + + out.value = + graph.add_tensor(size_big, vkapi::kFloat, /*shared_object_idx = */ 2); + + // Perform the following compute + // + // a, b, out; + // { + // inter; + // { + // tmp = a + b + // tmp2 = tmp + a + // inter = tmp2 + b + // } + // { + // tmp = inter + b; + // tmp2 = tmp + a + // out = tmp2 + b; + // } + // } + { + TmpTensor inter(&graph, size_big, vkapi::kFloat); + EXPECT_TRUE(inter.sobj_idx == 3); + { + TmpTensor tmp(&graph, size_big, vkapi::kFloat); + EXPECT_TRUE(tmp.sobj_idx == 4); + VK_GET_OP_FN("aten.add.Tensor") + (graph, {a, b, kDummyValueRef, tmp}); + + TmpTensor tmp2(&graph, size_big, vkapi::kFloat); + EXPECT_TRUE(tmp2.sobj_idx == 5); + VK_GET_OP_FN("aten.add.Tensor") + (graph, {tmp, a, kDummyValueRef, tmp2}); + + VK_GET_OP_FN("aten.add.Tensor") + (graph, {tmp2, b, kDummyValueRef, inter}); + } + { + TmpTensor tmp(&graph, size_big, vkapi::kFloat); + EXPECT_TRUE(tmp.sobj_idx == 4); + VK_GET_OP_FN("aten.add.Tensor") + (graph, {inter, b, kDummyValueRef, tmp}); + + 
TmpTensor tmp2(&graph, size_big, vkapi::kFloat); + EXPECT_TRUE(tmp2.sobj_idx == 5); + VK_GET_OP_FN("aten.add.Tensor") + (graph, {tmp, a, kDummyValueRef, tmp2}); + + VK_GET_OP_FN("aten.add.Tensor") + (graph, {tmp2, b, kDummyValueRef, out}); + } + } + + out.staging = graph.set_output_tensor(out.value); + + graph.prepare(); + graph.encode_execute(); + + // Run graph + + for (float i = 5.0f; i < 30.0f; i += 10.0f) { + float val_a = i + 2.0f; + float val_b = i + 1.5f; + float val_tmp = val_a + val_b; + float val_tmp2 = val_tmp + val_a; + float val_inter = val_tmp2 + val_b; + float val_tmp_2 = val_inter + val_b; + float val_tmp2_2 = val_tmp_2 + val_a; + float val_out = val_tmp2_2 + val_b; + + fill_vtensor(graph, a, val_a); + fill_vtensor(graph, b, val_b); + + graph.execute(); + + EXTRACT_TENSOR(out); + + // Sanity check that the values are correct + for (size_t i = 0; i < graph.get_tensor(out.value)->numel(); ++i) { + CHECK_VALUE(data_out, i, val_out); + } + } +} + TEST(VulkanComputeGraphTest, test_large_graph) { auto build_start_time = std::chrono::system_clock::now(); GraphConfig config; @@ -2050,9 +2274,9 @@ void run_from_gpu_test( context()->submit_compute_job( VK_KERNEL_FROM_STR(kernel_name), pipeline_barrier, - vten.image_extents(), + vten.logical_limits(), {4, 4, 4}, - {vten.packed_dim_whcn_idx(), offset}, + {vten.packed_dim(), offset}, VK_NULL_HANDLE, 0, vten.image( @@ -2062,7 +2286,7 @@ void run_from_gpu_test( vten.sizes_ubo()); } - StorageBuffer staging_buffer(context(), dtype, vten.staging_buffer_numel()); + StagingBuffer staging_buffer(context(), dtype, vten.staging_buffer_numel()); if (dtype == vkapi::kChar && !context()->adapter_ptr()->has_full_int8_buffers_support()) { @@ -2074,7 +2298,7 @@ void run_from_gpu_test( submit_to_gpu(); std::vector data_out(staging_buffer.numel()); - copy_staging_to_ptr(staging_buffer, data_out.data(), staging_buffer.nbytes()); + staging_buffer.copy_to(data_out.data(), staging_buffer.nbytes()); for (int i = 0; i < vten.numel(); i++) { CHECK_VALUE(data_out, i, i + offset); @@ -2095,18 +2319,17 @@ void round_trip_test( vTensor vten = vTensor(context(), sizes, dtype, storage_type, memory_layout); // Create and fill input staging buffer - StorageBuffer staging_buffer_in( + StagingBuffer staging_buffer_in( context(), dtype, vten.staging_buffer_numel()); std::vector data_in(staging_buffer_in.numel()); for (int i = 0; i < staging_buffer_in.numel(); i++) { data_in[i] = T(i * -1); } - copy_ptr_to_staging( - data_in.data(), staging_buffer_in, vten.staging_buffer_nbytes()); + staging_buffer_in.copy_from(data_in.data(), vten.staging_buffer_nbytes()); // Output staging buffer - StorageBuffer staging_buffer_out( + StagingBuffer staging_buffer_out( context(), dtype, vten.staging_buffer_numel()); record_nchw_to_image_op(context(), staging_buffer_in.buffer(), vten); @@ -2124,8 +2347,7 @@ void round_trip_test( // Extract data from output staging buffer std::vector data_out(staging_buffer_out.numel()); - copy_staging_to_ptr( - staging_buffer_out, data_out.data(), staging_buffer_out.nbytes()); + staging_buffer_out.copy_to(data_out.data(), staging_buffer_out.nbytes()); // All indices should be equal to the input data for (int i = 0; i < vten.numel(); i++) { @@ -2231,7 +2453,7 @@ TEST(VulkanToFromGPUShaderTest, round_trip_tests) { for (auto& sizes : to_test) { RUN_TESTS(float, vkapi::kFloat) - RUN_TESTS(torch::executor::Half, vkapi::kHalf) + RUN_TESTS(exec_aten::Half, vkapi::kHalf) } for (auto& sizes : to_test_int8) { @@ -2451,6 +2673,7 @@ TEST(VulkanComputeGraphOpsTest, 
mm_smoke_test) { prepack); CALL_TEST_FN_FOR_W_PACKED(RUN_TESTS); + CALL_TEST_FN_FOR_C_PACKED(RUN_TESTS); #undef RUN_TESTS } @@ -2559,19 +2782,18 @@ void test_conv2d( // Create and fill input staging buffer const int64_t in_numel = utils::multiply_integers(original_sizes); - StorageBuffer staging_buffer_in(context(), vkapi::kFloat, in_numel); + StagingBuffer staging_buffer_in(context(), vkapi::kFloat, in_numel); std::vector data_in(in_numel); for (int i = 0; i < in_numel; i++) { data_in[i] = i + 1; } - copy_ptr_to_staging( - data_in.data(), staging_buffer_in, sizeof(float) * in_numel); + staging_buffer_in.copy_from(data_in.data(), sizeof(float) * in_numel); // Output staging buffer const int64_t out_numel = padded_sizes[0] * padded_sizes[1] * original_sizes[2] * original_sizes[3]; - StorageBuffer staging_buffer_out(context(), vkapi::kFloat, out_numel); + StagingBuffer staging_buffer_out(context(), vkapi::kFloat, out_numel); // Copy data in and out of the tensor record_conv2d_prepack_weights_op( @@ -2583,8 +2805,7 @@ void test_conv2d( // Extract data from output staging buffer std::vector data_out(out_numel); - copy_staging_to_ptr( - staging_buffer_out, data_out.data(), sizeof(float) * out_numel); + staging_buffer_out.copy_to(data_out.data(), sizeof(float) * out_numel); // Check data matches results copied from ATen-VK for (int i = 0; i < vten.numel(); i++) { @@ -2683,13 +2904,150 @@ TEST(VulkanComputeGraphOpsTest, grid_priors_test) { /*data_out_expected = */ {4, 4, 12, 4, 20, 4, 4, 12, 12, 12, 20, 12}); } +void test_int4pack_mm( + std::vector MKN, + uint32_t group_size, + utils::StorageType storage_type) { + GraphConfig config; + ComputeGraph graph(config); + + const uint32_t M = MKN[0]; + const uint32_t K = MKN[1]; + const uint32_t N = MKN[2]; + + const std::vector mat1_size = {M, K}; + const std::vector mat2_size = {K, N}; + const std::vector mat2_q_size = {N, K / 2}; // Transposed and packed + const std::vector out_size = {M, N}; + + std::vector A_data = create_random_float_buffer(M * K); + IOValueRef A = graph.add_input_tensor(mat1_size, vkapi::kFloat, storage_type); + graph.copy_into_staging(A.staging, A_data.data(), A_data.size()); + + // Quantized but un-packed weights + std::vector B_quant_data = create_random_uint8_buffer(K * N, 0, 16); + + // Pack and transpose weights to correspond to int4 weight format + std::vector B_int4_data = + int4mm_pack_weights(mat2_size, B_quant_data.data()); + + IOValueRef B_int4 = + graph.add_input_tensor(mat2_q_size, vkapi::kQInt8, utils::kBuffer); + graph.copy_into_staging( + B_int4.staging, B_int4_data.data(), B_int4_data.size()); + + const int k_groups = K / group_size; + + // Random scales and zeroes. 
Keep scales small to avoid overflow and zeroes in + // int4 range + IOValueRef scales_and_zeros; + + if (storage_type == utils::kBuffer) { + scales_and_zeros.value = graph.add_tensor( + {2, N, k_groups}, vkapi::kFloat, storage_type, utils::kWidthPacked); + } else { + scales_and_zeros.value = graph.add_tensor( + {2, N, k_groups}, vkapi::kFloat, storage_type, utils::kChannelsPacked); + } + + scales_and_zeros.staging = graph.set_input_tensor(scales_and_zeros.value); + + std::vector s_data(graph.numel_of(scales_and_zeros.value)); + const int zeros_stride = s_data.size() / 2; + for (size_t i = 0; i < zeros_stride; i++) { + s_data[i] = rand() % 100; + s_data[i + zeros_stride] = rand() % 16; + } + + graph.copy_into_staging( + scales_and_zeros.staging, s_data.data(), s_data.size()); + + IOValueRef out_int4; + + if (storage_type == utils::kBuffer) { + out_int4.value = graph.add_tensor(out_size, vkapi::kFloat, utils::kBuffer); + } else { + out_int4.value = + graph.add_tensor(out_size, vkapi::kFloat, utils::kChannelsPacked); + } + + VK_GET_OP_FN("aten._weight_int4pack_mm.default") + (graph, + {A.value, + B_int4.value, + graph.add_scalar(group_size), + scales_and_zeros.value, + out_int4.value}); + + out_int4.staging = graph.set_output_tensor(out_int4.value); + + // Dequantized matmul for comparison + IOValueRef B_deq = + graph.add_input_tensor(mat2_size, vkapi::kFloat, storage_type); + std::vector B_deq_data = int4mm_dequantize_weights( + mat2_size, B_quant_data.data(), group_size, s_data.data()); + graph.copy_into_staging(B_deq.staging, B_deq_data.data(), B_deq_data.size()); + + IOValueRef out_deq; + out_deq.value = graph.add_tensor(out_size, vkapi::kFloat, storage_type); + + VK_GET_OP_FN("aten.mm.default") + (graph, {A.value, B_deq.value, out_deq.value}); + + out_deq.staging = graph.set_output_tensor(out_deq.value); + + graph.prepare(); + graph.encode_prepack(); + graph.prepack(); + graph.encode_execute(); + graph.propagate_resize(); + graph.execute(); + + // Compare outputs + std::vector out_int4_data(graph.numel_of(out_int4.value)); + graph.copy_from_staging( + out_int4.staging, out_int4_data.data(), out_int4_data.size()); + + std::vector out_deq_data(graph.numel_of(out_deq.value)); + graph.copy_from_staging( + out_deq.staging, out_deq_data.data(), out_deq_data.size()); + + for (int i = 0; i < out_int4_data.size(); i++) { + EXPECT_TRUE(check_close(out_int4_data[i], out_deq_data[i])); + } +} + +TEST(VulkanComputeGraphOpsTest, int4pack_mm_test) { + if (!context()->adapter_ptr()->has_full_int8_buffers_support()) { + GTEST_SKIP(); + } + + for (auto storage_type : {utils::kBuffer, utils::kTexture3D}) { + // Vector multiplication, single group per row + test_int4pack_mm({1, 32, 1}, 32, storage_type); + + // Vector multiplication, multiple groups per row + test_int4pack_mm({1, 256, 1}, 64, storage_type); + + // Square matrices, single group per row + test_int4pack_mm({32, 32, 32}, 32, storage_type); + + // Irregular matrices, single group per row + test_int4pack_mm({37, 32, 19}, 32, storage_type); + + // Irregular matrices, multiple groups per row + test_int4pack_mm({37, 256, 19}, 64, storage_type); + } +} + void test_transpose_view_mm( const int B, const int M, const int K, - const int N) { + const int N, + utils::StorageType storage_type) { GraphConfig config; - config.set_storage_type_override(utils::kBuffer); + config.set_storage_type_override(storage_type); ComputeGraph graph(config); std::vector mat1_size = {M, K}; @@ -2717,10 +3075,10 @@ void test_transpose_view_mm( IOValueRef mat1 = 
graph.add_input_tensor(mat1_size, vkapi::kFloat, utils::kWidthPacked); - IOValueRef mat2_t = + IOValueRef mat2_transpose = graph.add_input_tensor(mat2_t_size, vkapi::kFloat, utils::kWidthPacked); - ValueRef mat2 = graph.add_tensor_view(mat2_t.value); + ValueRef mat2 = graph.add_tensor_view(mat2_transpose.value); ValueRef dim0; ValueRef dim1; @@ -2736,7 +3094,8 @@ void test_transpose_view_mm( IOValueRef out; out.value = graph.add_tensor(out_size, vkapi::kFloat, utils::kWidthPacked); - VK_GET_OP_FN("aten.transpose.int")(graph, {mat2_t.value, dim0, dim1, mat2}); + VK_GET_OP_FN("aten.transpose.int") + (graph, {mat2_transpose.value, dim0, dim1, mat2}); VK_GET_OP_FN("aten.mm.default")(graph, {mat1.value, mat2, out.value}); out.staging = graph.set_output_tensor(out.value); @@ -2767,5 +3126,7 @@ void test_transpose_view_mm( } TEST(VulkanComputeGraphOpsTest, test_transpose_with_mm) { - test_transpose_view_mm(2, 7, 17, 5); + for (auto storage_type : {utils::kBuffer, utils::kTexture3D}) { + test_transpose_view_mm(2, 7, 17, 5, storage_type); + } } diff --git a/backends/vulkan/tools/gpuinfo/include/architecture.h b/backends/vulkan/tools/gpuinfo/include/architecture.h index 0d312ee87c3..9af908eb170 100644 --- a/backends/vulkan/tools/gpuinfo/include/architecture.h +++ b/backends/vulkan/tools/gpuinfo/include/architecture.h @@ -40,7 +40,7 @@ void reg_count(const App& app) { uint32_t NITER; auto bench = [&](uint32_t ngrp, uint32_t nreg) { - StorageBuffer buffer(context(), vkapi::kFloat, 1); + StagingBuffer buffer(context(), vkapi::kFloat, 1); vkapi::PipelineBarrier pipeline_barrier{}; auto shader_name = "reg_count_" + std::to_string(nreg); @@ -164,7 +164,7 @@ void warp_size(const App& app, const bool verbose = false) { uint32_t NITER; auto bench = [&](uint32_t nthread) { - StorageBuffer out_buf(context(), vkapi::kInt, app.nthread_logic); + StagingBuffer out_buf(context(), vkapi::kInt, app.nthread_logic); vkapi::PipelineBarrier pipeline_barrier{}; auto shader_name = "warp_size_physical"; @@ -224,7 +224,7 @@ void warp_size(const App& app, const bool verbose = false) { // doesn't depend on kernel timing, so the extra wait time doesn't lead to // inaccuracy. auto bench_sm = [&](uint32_t nthread) { - StorageBuffer out_buf(context(), vkapi::kInt, app.nthread_logic); + StagingBuffer out_buf(context(), vkapi::kInt, app.nthread_logic); vkapi::PipelineBarrier pipeline_barrier{}; auto shader_name = "warp_size_scheduler"; @@ -242,7 +242,7 @@ void warp_size(const App& app, const bool verbose = false) { }); std::vector data(app.nthread_logic); - copy_staging_to_ptr(out_buf, data.data(), out_buf.nbytes()); + out_buf.copy_to(data.data(), out_buf.nbytes()); if (verbose) { std::stringstream ss; diff --git a/backends/vulkan/tools/gpuinfo/include/buffers.h b/backends/vulkan/tools/gpuinfo/include/buffers.h index c8cf93c4a12..31137b11eea 100644 --- a/backends/vulkan/tools/gpuinfo/include/buffers.h +++ b/backends/vulkan/tools/gpuinfo/include/buffers.h @@ -35,8 +35,8 @@ void buf_cacheline_size(const App& app) { uint32_t NITER; auto bench = [&](int stride) { - StorageBuffer in_buf(context(), vkapi::kFloat, BUF_SIZE); - StorageBuffer out_buf(context(), vkapi::kFloat, 1); + StagingBuffer in_buf(context(), vkapi::kFloat, BUF_SIZE); + StagingBuffer out_buf(context(), vkapi::kFloat, 1); vkapi::PipelineBarrier pipeline_barrier{}; auto shader_name = "buf_cacheline_size"; @@ -132,8 +132,8 @@ void _bandwidth( // workgroups, once the size of the access excedes the workgroup width. 
const uint32_t workgroup_width = local_x * NITER * NUNROLL; - StorageBuffer in_buf(context(), vkapi::kFloat, range / sizeof(float)); - StorageBuffer out_buf( + StagingBuffer in_buf(context(), vkapi::kFloat, range / sizeof(float)); + StagingBuffer out_buf( context(), vkapi::kFloat, VEC_WIDTH * app.nthread_logic); vkapi::PipelineBarrier pipeline_barrier{}; diff --git a/backends/vulkan/tools/gpuinfo/include/textures.h b/backends/vulkan/tools/gpuinfo/include/textures.h index 7679f11b0ca..c9ff133f1ec 100644 --- a/backends/vulkan/tools/gpuinfo/include/textures.h +++ b/backends/vulkan/tools/gpuinfo/include/textures.h @@ -61,7 +61,7 @@ void tex_cacheline_concurr(const App& app) { vTensor in_tensor = api::vTensor(api::context(), sizes_nchw, vkapi::kFloat); - StorageBuffer out_buf(context(), vkapi::kFloat, TEXEL_WIDTH); + StagingBuffer out_buf(context(), vkapi::kFloat, TEXEL_WIDTH); vkapi::PipelineBarrier pipeline_barrier{}; @@ -173,7 +173,7 @@ void tex_bandwidth(const App& app) { // workgroups, once the size of the access excedes the workgroup width. const uint32_t workgroup_width = local_x * NITER * NUNROLL; - StorageBuffer out_buf( + StagingBuffer out_buf( context(), vkapi::kFloat, VEC_WIDTH * app.nthread_logic); vkapi::PipelineBarrier pipeline_barrier{}; diff --git a/backends/vulkan/vulkan_preprocess.py b/backends/vulkan/vulkan_preprocess.py index 1865c32acd7..7e85c25faee 100644 --- a/backends/vulkan/vulkan_preprocess.py +++ b/backends/vulkan/vulkan_preprocess.py @@ -57,7 +57,7 @@ def preprocess( # noqa: C901 MeanToSumDiv(), SpecPropPass(), ConstraintBasedSymShapeEvalPass(), - MemoryPlanningPass("greedy"), + MemoryPlanningPass(), ] new_gm = program.graph_module diff --git a/backends/xnnpack/CMakeLists.txt b/backends/xnnpack/CMakeLists.txt index a5b12d65799..c22f029c263 100644 --- a/backends/xnnpack/CMakeLists.txt +++ b/backends/xnnpack/CMakeLists.txt @@ -32,9 +32,11 @@ if(NOT PYTHON_EXECUTABLE) resolve_python_executable() endif() -# NB: Enabling this will serialize execution of delegate instances -# Keeping this OFF by default to maintain existing behavior, to be revisited. -option(EXECUTORCH_XNNPACK_SHARED_WORKSPACE "Enable workspace sharing across different delegate instances" OFF) +# NB: Enabling this will serialize execution of delegate instances. +# This setting may have performance implications. +option(EXECUTORCH_XNNPACK_SHARED_WORKSPACE + "Enable workspace sharing across different delegate instances" ON +) if(EXECUTORCH_XNNPACK_SHARED_WORKSPACE) add_definitions(-DENABLE_XNNPACK_SHARED_WORKSPACE) endif() diff --git a/backends/xnnpack/README.md b/backends/xnnpack/README.md index 33a0bfaf309..0c3d7e14428 100644 --- a/backends/xnnpack/README.md +++ b/backends/xnnpack/README.md @@ -105,9 +105,10 @@ mkdir cmake-out cmake \ -DCMAKE_INSTALL_PREFIX=cmake-out \ -DCMAKE_BUILD_TYPE=Release \ + -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ -DEXECUTORCH_BUILD_XNNPACK=ON \ - -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ -DEXECUTORCH_ENABLE_LOGGING=ON \ -DPYTHON_EXECUTABLE=python \ -Bcmake-out . 
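A note on the MemoryPlanningPass() change in backends/vulkan/vulkan_preprocess.py above: dropping the explicit "greedy" argument is behavior-preserving only if the pass's default algorithm is still greedy. A minimal sketch of that assumption (the import path is assumed, not taken from this patch):

# Sketch only: assumes MemoryPlanningPass defaults to the greedy algorithm.
from executorch.exir.passes import MemoryPlanningPass

old_style = MemoryPlanningPass("greedy")  # previous, explicit spelling
new_style = MemoryPlanningPass()          # new spelling; relies on the default algorithm
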
diff --git a/backends/xnnpack/cmake/Dependencies.cmake b/backends/xnnpack/cmake/Dependencies.cmake index 40e4e72c38b..b76c54bee60 100644 --- a/backends/xnnpack/cmake/Dependencies.cmake +++ b/backends/xnnpack/cmake/Dependencies.cmake @@ -36,6 +36,10 @@ set(XNNPACK_ENABLE_AVXVNNI OFF CACHE BOOL "" ) +set(XNNPACK_ENABLE_KLEIDIAI + OFF + CACHE BOOL "" +) add_subdirectory("${XNNPACK_SOURCE_DIR}") include_directories(SYSTEM ${XNNPACK_INCLUDE_DIR}) list(APPEND xnnpack_third_party XNNPACK) diff --git a/backends/xnnpack/operators/op_dynamic_dequantize_ops.py b/backends/xnnpack/operators/op_dynamic_dequantize_ops.py index d47f9f479e4..f8f0c54ee68 100644 --- a/backends/xnnpack/operators/op_dynamic_dequantize_ops.py +++ b/backends/xnnpack/operators/op_dynamic_dequantize_ops.py @@ -12,7 +12,15 @@ register_node_visitor, ) from executorch.backends.xnnpack.serialization.xnnpack_graph_schema import XNNGraph -from executorch.backends.xnnpack.utils.utils import get_input_node +from executorch.backends.xnnpack.utils.quant_utils import ( + is_per_channel_group, + is_per_token, +) +from executorch.backends.xnnpack.utils.utils import ( + check_or_raise, + get_input_node, + is_param_node, +) @register_node_visitor @@ -65,3 +73,40 @@ def define_node( dq_input = get_input_node(node, 0) if dq_input in vals_to_ids: vals_to_ids[node] = vals_to_ids[dq_input] + + +@register_node_visitor +class OpDequantizeAffine(NodeVisitor): + target = "quant.dequantize_affine.default" + + def __init__(self, *args) -> None: + super().__init__(*args) + + def define_node( + self, + node: torch.fx.Node, + xnn_graph: XNNGraph, + vals_to_ids: Dict[torch.fx.Node, int], + debug_handle: int, + ) -> None: + """ + We always define dequantize affine nodes because they are always explicit + """ + if is_per_channel_group(node): + check_or_raise( + is_param_node(self._exported_program, node.all_input_nodes[0]), + f"Expected quantize affine node with per-token semantics to be used " + f"in front of a weight node, but found node {node.all_input_nodes[0]}", + ) + # Affine dequantize was recognized as per channel group which means that it should + # be skipped as this means it is used in front of a weight node + return + + check_or_raise( + is_per_token(node), + "Expecting Affine Dequantized Op to have per-token semantics", + ) + # This must be a per-token affine dequantized node, so let us serialize as such + dq_input = get_input_node(node, 0) + if dq_input in vals_to_ids: + vals_to_ids[node] = vals_to_ids[dq_input] diff --git a/backends/xnnpack/operators/op_dynamic_quantize_ops.py b/backends/xnnpack/operators/op_dynamic_quantize_ops.py index bf5f3b7b092..23047e731f7 100644 --- a/backends/xnnpack/operators/op_dynamic_quantize_ops.py +++ b/backends/xnnpack/operators/op_dynamic_quantize_ops.py @@ -17,6 +17,10 @@ XNNGraph, XNode, ) +from executorch.backends.xnnpack.utils.quant_utils import ( + is_per_channel_group, + is_per_token, +) from executorch.backends.xnnpack.utils.utils import check_or_raise, get_input_node @@ -118,3 +122,56 @@ def define_node( debug_handle=debug_handle, ) xnn_graph.xnodes.append(ser_node) + + +@register_node_visitor +class OpQuantizeAffine(NodeVisitor): + target = "quant.quantize_affine.default" + + def define_node( + self, + node: torch.fx.Node, + xnn_graph: XNNGraph, + vals_to_ids: Dict[torch.fx.Node, int], + debug_handle: int, + ) -> None: + """ + We always define quantize affine nodes because they are always explicit + """ + if is_per_channel_group(node): + # Affine quantized was recognized as per channel group which means 
that it should + # be skipped as this means it is used in front of a weight node + return + + check_or_raise( + is_per_token(node), + "Encountered affine quantized op which does not have per-token semantics", + ) + # Treat this node as dynamic per-token quantization + q_input = get_input_node(node, 0) + + # fp32 input + self.define_tensor(q_input, xnn_graph, vals_to_ids) + input_id = vals_to_ids[q_input] + + # dynamic quantized output + input_quant_params = QuantParams.from_q_dq_node(node) + # qinput isn't needed for dynamically quantized nodes since it will always be + # the output of a convert node. Instead we set q_input to the node itself so + # we can extract the shape from the dq output + input_quant_params.q_input = node + input_quant_params.is_input = False + check_or_raise( + input_quant_params.is_dynamic, + "Internal Error, dynamically quantized node expected dynamic quantized params", + ) + self.define_tensor( + node, xnn_graph, vals_to_ids, quant_params=input_quant_params + ) + output_id = vals_to_ids[node] + + ser_node = XNode( + xnode_union=XNNConvert(input_id=input_id, output_id=output_id, flags=0), + debug_handle=debug_handle, + ) + xnn_graph.xnodes.append(ser_node) diff --git a/backends/xnnpack/operators/op_skip_ops.py b/backends/xnnpack/operators/op_skip_ops.py index d6a54c901eb..6597c0568e3 100644 --- a/backends/xnnpack/operators/op_skip_ops.py +++ b/backends/xnnpack/operators/op_skip_ops.py @@ -97,6 +97,15 @@ class OpSymSizeInt(OpSkipOps): target = "sym_size.int" +@register_node_visitor +class OpChooseQparamsAffine(OpSkipOps): + """ + do nothing if node is choose_qparams_affine.default + """ + + target = "quant.choose_qparams_affine.default" + + @register_node_visitor class OpChooseQparamsToken(OpSkipOps): """ diff --git a/backends/xnnpack/operators/quant_params.py b/backends/xnnpack/operators/quant_params.py index d60c300276f..44908ac7fca 100644 --- a/backends/xnnpack/operators/quant_params.py +++ b/backends/xnnpack/operators/quant_params.py @@ -10,7 +10,15 @@ import torch from executorch.backends.xnnpack.passes.tag_implicit_q_dq_pass import TagImplicitQDqPass -from executorch.backends.xnnpack.utils.quant_utils import is_dequant, is_quant +from executorch.backends.xnnpack.utils.quant_utils import ( + extract_qdq_affine_op_args_for_decomposed_ops, + is_affine_qdq, + is_dequant, + is_dynamic_qdq, + is_per_channel, + is_per_channel_group, + is_quant, +) from executorch.backends.xnnpack.utils.utils import ( check_or_raise, get_param_tensor, @@ -154,30 +162,18 @@ def from_q_dq_node( q_input = quant_node.all_input_nodes[0] # TODO: Use presence of choose_qparam node to determine if this is a dynamic quantization - if quant_node.target in [ - exir_ops.edge.quantized_decomposed.quantize_per_tensor.tensor, - exir_ops.edge.quantized_decomposed.dequantize_per_tensor.tensor, - exir_ops.edge.quantized_decomposed.quantize_per_token.default, - exir_ops.edge.quantized_decomposed.dequantize_per_token.default, - ]: + if is_dynamic_qdq(quant_node): return cls._from_dynamic_input_node(quant_node) - per_channel = quant_node.target in [ - exir_ops.edge.quantized_decomposed.quantize_per_channel.default, - exir_ops.edge.quantized_decomposed.dequantize_per_channel.default, - ] - - _groupwise = False - if quant_node.target in [ - exir_ops.edge.quantized_decomposed.quantize_per_channel_group.default, - exir_ops.edge.quantized_decomposed.dequantize_per_channel_group.default, - ]: - # This is a sub-category of per channel quantization - per_channel = True - _groupwise = True - - scale = 
quant_node.args[1] - zp = quant_node.args[2] + per_channel = is_per_channel(quant_node) + + _groupwise = is_per_channel_group(quant_node) + quant_node_args = quant_node.args + if _groupwise and is_affine_qdq(quant_node): + quant_node_args = extract_qdq_affine_op_args_for_decomposed_ops(quant_node) + + scale = quant_node_args[1] + zp = quant_node_args[2] axis = 0 if per_channel: assert isinstance(scale, torch.fx.Node) and isinstance(scale.target, str) @@ -193,10 +189,15 @@ def _get_tensor(node): scale = _get_tensor(scale) zp = _get_tensor(zp) - axis = cast(int, quant_node.args[3]) + axis = cast(int, quant_node_args[3]) if _groupwise: scale_tensor = cast(torch.Tensor, scale) + if scale_tensor.ndim == 1: + scale_tensor = scale_tensor.reshape(-1, 1) + zp = zp.reshape(-1, 1) + scale = scale_tensor + assert ( scale_tensor.ndim == 2 ), "Weight scale must be 2D for per_channel_group [de]quant node, got {scale.ndim}D" @@ -204,23 +205,23 @@ def _get_tensor(node): check_or_raise( bool( - quant_node.args[-1] != torch.uint8 - or quant_node.args[-1] != torch.quint8 + quant_node_args[-1] != torch.uint8 + or quant_node_args[-1] != torch.quint8 ), "XNNPACK does not support unsigned quantization", ) if _groupwise: - _ = quant_node.args[-1] # output dtype - not used - group_size = cast(int, quant_node.args[-2]) - dtype = cast(torch.dtype, quant_node.args[-3]) - qmax = cast(int, quant_node.args[-4]) - qmin = cast(int, quant_node.args[-5]) + _ = quant_node_args[-1] # output dtype - not used + group_size = cast(int, quant_node_args[-2]) + dtype = cast(torch.dtype, quant_node_args[-3]) + qmax = cast(int, quant_node_args[-4]) + qmin = cast(int, quant_node_args[-5]) else: group_size = 0 - dtype = cast(torch.dtype, quant_node.args[-1]) - qmax = cast(int, quant_node.args[-2]) - qmin = cast(int, quant_node.args[-3]) + dtype = cast(torch.dtype, quant_node_args[-1]) + qmax = cast(int, quant_node_args[-2]) + qmin = cast(int, quant_node_args[-3]) is_output = any( user_node.op == "output" for user_node in quant_node.users.keys() @@ -244,26 +245,14 @@ def _get_tensor(node): def from_weights( cls, tensor_node: torch.fx.Node, ep: Optional[ExportedProgram] = None ) -> Optional[QuantParams]: - # Ignore transpose for weights - # TODO:T148540997 remove the t_copy/permute_copy check when convert addmm to linear - dq = ( - tensor_node.all_input_nodes[0] - if tensor_node.target - in ( - exir_ops.edge.aten.permute_copy.default, - exir_ops.edge.aten.t_copy.default, - ) - else tensor_node - ) - # check input of t_copy/permute_copy is dequant - if not is_dequant(dq): + if not is_dequant(tensor_node): return None # source node for quant params - src = dq + src = tensor_node # is input of dq is q? 
- dq_input = dq.all_input_nodes[0] + dq_input = src.all_input_nodes[0] if is_quant(dq_input): src = dq_input diff --git a/backends/xnnpack/partition/config/__init__.py b/backends/xnnpack/partition/config/__init__.py index 141ccf9802e..ed105dc1f53 100644 --- a/backends/xnnpack/partition/config/__init__.py +++ b/backends/xnnpack/partition/config/__init__.py @@ -53,6 +53,11 @@ MaxDimConfig, PreluConfig, ) +from executorch.backends.xnnpack.partition.config.quant_affine_configs import ( + ChooseQParamsAffineConfig, + DeQuantizeAffineConfig, + QuantizeAffineConfig, +) from executorch.backends.xnnpack.partition.config.xnnpack_config import ( XNNPartitionerConfig, ) @@ -98,4 +103,8 @@ # Quant/Dequant Op Configs QuantizedPerTensorConfig, DeQuantizedPerTensorConfig, + # Quant Affine Configs to preserve decomp + QuantizeAffineConfig, + DeQuantizeAffineConfig, + ChooseQParamsAffineConfig, ] diff --git a/backends/xnnpack/partition/config/gemm_configs.py b/backends/xnnpack/partition/config/gemm_configs.py index 3c4d446a6b4..cbcb14899d4 100644 --- a/backends/xnnpack/partition/config/gemm_configs.py +++ b/backends/xnnpack/partition/config/gemm_configs.py @@ -4,6 +4,7 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +import logging from itertools import chain from typing import cast, List, Optional, Tuple @@ -13,9 +14,12 @@ XNNPartitionerConfig, ) from executorch.backends.xnnpack.utils.quant_utils import ( + extract_qdq_affine_op_args_for_decomposed_ops, + is_affine_qdq, is_dequant, is_dynamic_qdq, is_per_channel, + is_per_channel_group, is_qparam, is_quant, ) @@ -28,12 +32,16 @@ from executorch.exir.backend.canonical_partitioners.config_partitioner import ( format_target_name, ) +from executorch.exir.backend.utils import WhyNoPartition from torch.export import ExportedProgram from torch.fx.passes.utils.source_matcher_utils import ( get_source_partitions, SourcePartition, ) +logger = logging.getLogger(__name__) +why = WhyNoPartition(logger=logger) + class GEMMConfig(XNNPartitionerConfig): """ @@ -44,8 +52,8 @@ class GEMMConfig(XNNPartitionerConfig): different ops """ - def __init__(self, weight_idx, bias_idx, act_idx, fused_acts): - super().__init__() + def __init__(self, weight_idx, bias_idx, act_idx, fused_acts, **kwargs): + super().__init__(**kwargs) self.weight_idx = weight_idx self.bias_idx = bias_idx self.act_idx = act_idx @@ -57,6 +65,8 @@ def check_constraints(self, node: torch.fx.Node, ep: ExportedProgram) -> bool: return False is_valid, _ = self.get_deps(node, ep) + if not is_valid: + why(node, "Failed to get valid dependent nodes.") return is_valid def get_node_and_deps( @@ -131,7 +141,7 @@ def _get_weight_deps( return False, [] gemm_deps.append(weight) - if is_per_channel(dequant_node): + if is_per_channel(dequant_node) or is_per_channel_group(dequant_node): if len(dequant_node.all_input_nodes) < 2: # Expected channel quantized to have scale/zp nodes return False, [] @@ -214,12 +224,15 @@ def _get_act_deps( return (False, []) gemm_deps.append(q_input) - if not (is_node(q_input.args[1]) and is_node(q_input.args[2])): + q_input_args = q_input.args + if is_affine_qdq(q_input): + q_input_args = extract_qdq_affine_op_args_for_decomposed_ops(q_input) + if not (is_node(q_input_args[1]) and is_node(q_input_args[2])): # expected to find getitem node from choose qparam return (False, []) - getitem1 = get_input_node(q_input, 1) - getitem2 = get_input_node(q_input, 2) + getitem1 = q_input_args[1] + getitem2 = 
q_input_args[2] if not (is_getitem(getitem1) and is_getitem(getitem2)): # expected getitem node from choose qparam @@ -237,17 +250,28 @@ def _get_act_deps( class LinearConfig(GEMMConfig): target_name = "linear.default" - def __init__(self): + def __init__(self, **kwargs): super().__init__( weight_idx=1, bias_idx=2, act_idx=0, fused_acts=["relu.default", "hardtanh.default"], + **kwargs, ) def get_original_aten(self) -> Optional[torch._ops.OpOverload]: return torch.ops.aten.linear.default + def _get_weight_deps( + self, node: torch.fx.Node, ep: ExportedProgram, precision: ConfigPrecisionType + ) -> Tuple[bool, List[torch.fx.Node]]: + if precision == ConfigPrecisionType.FP32 and self.force_fp32_dynamic_linear: + # if force fp32_dynamic_linear is on and we detected this as fp32, then we + # do not partition the weight node + return (True, []) + + return super()._get_weight_deps(node, ep, precision) + def supported_precision_types(self): return [ ConfigPrecisionType.DYNAMIC_QUANT, @@ -259,12 +283,13 @@ def supported_precision_types(self): class ConvolutionConfig(GEMMConfig): target_name = "convolution.default" - def __init__(self): + def __init__(self, **kwargs): super().__init__( weight_idx=1, bias_idx=2, act_idx=0, fused_acts=["relu.default", "hardtanh.default"], + **kwargs, ) def check_constraints(self, node: torch.fx.Node, ep: ExportedProgram) -> bool: @@ -276,10 +301,12 @@ def check_constraints(self, node: torch.fx.Node, ep: ExportedProgram) -> bool: conv_stride = cast(List[int], node.args[3]) if len(conv_stride) > 2: + why(node, "Only support 1D + 2D Conv") return False # Only support 1D + 2D Conv transposed = cast(bool, node.args[6]) if transposed: + why(node, "Transposed Conv is not supported") return False # Currently don't support transposed conv return True @@ -299,12 +326,13 @@ class AddmmConfig(GEMMConfig): target_name = "addmm.default" - def __init__(self): + def __init__(self, **kwargs): super().__init__( weight_idx=2, bias_idx=0, act_idx=1, fused_acts=["relu.default", "hardtanh.default"], + **kwargs, ) self.src_partitions = None self.linear_modules = [torch.nn.functional.linear, torch.nn.Linear] @@ -402,8 +430,8 @@ def supported_precision_types(self): class MMConfig(AddmmConfig): target_name = "mm.default" - def __init__(self): - super().__init__() + def __init__(self, **kwargs): + super().__init__(**kwargs) self.bias_idx = None self.weight_idx = 1 self.act_idx = 0 diff --git a/backends/xnnpack/partition/config/generic_node_configs.py b/backends/xnnpack/partition/config/generic_node_configs.py index e309a3bd038..b95d7c5b89c 100644 --- a/backends/xnnpack/partition/config/generic_node_configs.py +++ b/backends/xnnpack/partition/config/generic_node_configs.py @@ -4,6 +4,7 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
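The force_fp32_dynamic_linear flag threaded through GEMMConfig/LinearConfig above is read from **kwargs in the base config and forwarded by the partitioner (see the xnnpack_config.py and xnnpack_partitioner.py hunks later in this patch). A hedged usage sketch, assuming an otherwise standard lowering flow:

# Sketch: keep fp32 dynamic-linear weights out of the delegated partition.
from executorch.backends.xnnpack.partition.xnnpack_partitioner import XnnpackPartitioner

partitioner = XnnpackPartitioner(force_fp32_dynamic_linear=True)
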
+import logging from typing import cast, List, Optional import torch @@ -16,17 +17,21 @@ from executorch.exir.backend.canonical_partitioners.config_partitioner import ( format_target_name, ) +from executorch.exir.backend.utils import WhyNoPartition from torch.export import ExportedProgram +logger = logging.getLogger(__name__) +why = WhyNoPartition(logger=logger) + class GenericNodePartitionerConfig(XNNPartitionerConfig): - def __init__(self, fused_act: Optional[List[str]] = None): + def __init__(self, fused_act: Optional[List[str]] = None, **kwargs): """ fused_act is a list of node target names that can be fused with this node under quantization """ self.fused_acts = fused_act or [] - super().__init__() + super().__init__(**kwargs) def check_constraints(self, node: torch.fx.Node, ep: ExportedProgram) -> bool: return self.check_common_constraints(node, ep) @@ -93,8 +98,8 @@ def supported_precision_types(self) -> List[ConfigPrecisionType]: class AddConfig(GenericNodePartitionerConfig): target_name = "add.Tensor" - def __init__(self): - super().__init__(fused_act=["relu.default"]) + def __init__(self, **kwargs): + super().__init__(fused_act=["relu.default"], **kwargs) def supported_precision_types(self) -> List[ConfigPrecisionType]: return [ConfigPrecisionType.FP32, ConfigPrecisionType.STATIC_QUANT] @@ -141,9 +146,22 @@ def check_constraints(self, node: torch.fx.Node, ep: ExportedProgram) -> bool: if len(args) >= 7: divisor_override = cast(int, args[6]) - return ( - not (ceil_mode or count_include_pad) and divisor_override == pooling_region - ) + if ceil_mode: + why(node, reason="ceil mode is not supported") + return False + + if count_include_pad: + why( + node, + reason="zero-padding in the averaging calculation is not supported", + ) + return False + + if divisor_override != pooling_region: + why(node, reason="divisor override is not supported") + return False + + return True def supported_precision_types(self) -> List[ConfigPrecisionType]: return [ConfigPrecisionType.FP32] @@ -160,7 +178,15 @@ def check_constraints(self, node: torch.fx.Node, ep: ExportedProgram) -> bool: return False num_tensors = len(node.all_input_nodes) - return num_tensors >= 2 and num_tensors <= 4 + + if not (num_tensors >= 2 and num_tensors <= 4): + why( + node, + reason=f"only support concatenation of 2 - 4 tensors, got {num_tensors} tensors", + ) + return False + + return True def supported_precision_types(self) -> List[ConfigPrecisionType]: return [ConfigPrecisionType.FP32, ConfigPrecisionType.STATIC_QUANT] @@ -210,7 +236,14 @@ def check_constraints(self, node: torch.fx.Node, ep: ExportedProgram) -> bool: dim = cast(int, node.args[1]) node_input = node.all_input_nodes[0] tensor_dims = node_input.meta["val"].dim() - return dim == -1 or dim == tensor_dims - 1 + + if not (dim == -1 or dim == tensor_dims - 1): + why( + node, + reason=f"dim must be the last dim, got dim = {dim} for tensor of rank {tensor_dims}", + ) + return False + return True def supported_precision_types(self) -> List[ConfigPrecisionType]: return [ConfigPrecisionType.FP32] @@ -255,7 +288,10 @@ def check_constraints(self, node: torch.fx.Node, ep: ExportedProgram) -> bool: return False is_ceil_mode = len(node.args) >= 6 and cast(bool, node.args[5]) - return not is_ceil_mode + if is_ceil_mode: + why(node, reason="ceil mode is not supported") + return False + return True def supported_precision_types(self) -> List[ConfigPrecisionType]: return [ConfigPrecisionType.FP32, ConfigPrecisionType.STATIC_QUANT] @@ -309,7 +345,20 @@ def check_constraints(self, 
node: torch.fx.Node, ep: ExportedProgram) -> bool: dims = node.args[1] output_dims = node.meta["val"].dim() - return dims in ([-2, -1], [-1, -2]) and output_dims == 4 + if dims not in ([-2, -1], [-1, -2]): + why( + node, + reason="mean.dim only supports averaging 4D tensors across the innermost dimensions", + ) + return False + + if output_dims != 4: + why( + node, + reason=f"mean.dim only supports averaging 4D tensors, got tensor of rank {output_dims}", + ) + return False + return True def supported_precision_types(self) -> List[ConfigPrecisionType]: return [ConfigPrecisionType.FP32, ConfigPrecisionType.STATIC_QUANT] @@ -340,7 +389,15 @@ def check_constraints(self, node: torch.fx.Node, ep: ExportedProgram) -> bool: return False power = node.args[1] - return isinstance(power, int) and power == 2 + + if not isinstance(power, int): + why(node, reason=f"only support int powers, got {power}") + return False + + if power != 2: + why(node, reason=f"only support power == 2, got {power}") + return False + return True def supported_precision_types(self) -> List[ConfigPrecisionType]: return [ConfigPrecisionType.FP32] @@ -372,10 +429,18 @@ def check_constraints(self, node: torch.fx.Node, ep: ExportedProgram) -> bool: for dim in input_shape: if not isinstance(dim, int) or dim == 0: + why( + node, + reason=f"input tensor has invalid shape, dim: {dim} of type {type(dim)}. Expecting non-zero, int values.", + ) return False for dim in output_shape: if not isinstance(dim, int) or dim == 0: + why( + node, + reason=f"output tensor has invalid shape, dim: {dim} of type {type(dim)}. Expecting non-zero, int values.", + ) return False return True @@ -431,7 +496,14 @@ def check_constraints(self, node: torch.fx.Node, ep: ExportedProgram) -> bool: return False mask_node = node.all_input_nodes[3] mask_rank = mask_node.meta["val"].dim() - return mask_rank == 2 + if mask_rank != 2: + why( + node, + reason=f"mask must have rank 2, got mask of rank {mask_rank}", + ) + return False + + return True def get_original_aten(self) -> Optional[torch._ops.OpOverload]: return torch.ops.aten.scaled_dot_product_attention.default diff --git a/backends/xnnpack/partition/config/node_configs.py b/backends/xnnpack/partition/config/node_configs.py index 501216eaae3..2449d9d6440 100644 --- a/backends/xnnpack/partition/config/node_configs.py +++ b/backends/xnnpack/partition/config/node_configs.py @@ -4,6 +4,7 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
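To make the new why() diagnostics above concrete, here is an illustrative example (not part of the patch; shapes invented) for the softmax constraint: only a softmax over the last dimension is eligible for delegation.

import torch

x = torch.randn(2, 8, 16)
torch.nn.functional.softmax(x, dim=-1)  # last dim: passes check_constraints
torch.nn.functional.softmax(x, dim=1)   # not the last dim: rejected, why() logs the reason
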
+import logging import operator from typing import List, Optional @@ -19,8 +20,12 @@ from executorch.exir.backend.canonical_partitioners.config_partitioner import ( format_target_name, ) +from executorch.exir.backend.utils import WhyNoPartition from torch.export import ExportedProgram +logger = logging.getLogger(__name__) +why = WhyNoPartition(logger=logger) + class BatchNormConfig(XNNPartitionerConfig): target_name = "_native_batch_norm_legit_no_training.default" @@ -38,9 +43,15 @@ def check_constraints(self, node: torch.fx.Node, ep: ExportedProgram) -> bool: conv_name = format_target_name(conv.target.__name__) # pyre-ignore if conv_name not in ["convolution.default"]: + why(node, f"Invalid conv target {conv_name}") + return False + + can_fuse = FuseBatchNormWithConvPass.can_fuse(conv, bn, ep) + if not can_fuse: + why(node, "BatchNorm cannot be fused with Convolution") return False - return FuseBatchNormWithConvPass.can_fuse(conv, bn, ep) + return True def get_node_and_deps( self, node: torch.fx.Node, ep: ExportedProgram @@ -74,17 +85,25 @@ def check_constraints(self, node: torch.fx.Node, ep: ExportedProgram) -> bool: supported_dtypes = {torch.float32, torch.float16, torch.int8, torch.qint8} node_val = node.meta.get("val") output_0 = node_val[0] + + input_node = node.all_input_nodes[0] + if len(input_node.meta.get("val").shape) != 4: + why(node, f"Unsupported input rank {input_node.meta.get('val').shape}") + return False # Don't check indicies dtype if output_0.dtype not in supported_dtypes: + why(node, f"Unsupported output dtype {output_0.dtype}") return False max_input = node.all_input_nodes[0] if max_input.meta.get("val").dtype not in supported_dtypes: + why(node, f"Unsupported input dtype {max_input.meta.get('val').dtype}") return False # Make sure that all users are getitems of the first output for user in node.users: if not (user.target == operator.getitem and user.args[1] == 0): + why(node, "Unsupported user of max.dim") return False return True @@ -111,7 +130,11 @@ def check_constraints(self, node: torch.fx.Node, ep: ExportedProgram) -> bool: return False weight = node.all_input_nodes[1] - return is_param_node(ep, weight) + is_param = is_param_node(ep, weight) + if not is_param: + why(node, "Prelu weight must be a parameter") + return False + return True def get_original_aten(self) -> Optional[torch._ops.OpOverload]: return torch.ops.aten.prelu.default diff --git a/backends/xnnpack/partition/config/quant_affine_configs.py b/backends/xnnpack/partition/config/quant_affine_configs.py new file mode 100644 index 00000000000..d9e789104b6 --- /dev/null +++ b/backends/xnnpack/partition/config/quant_affine_configs.py @@ -0,0 +1,65 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
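Similarly illustrative (shapes invented, not part of the patch): the max.dim constraint above requires a 4-D input, and consumers may read only the values output (getitem index 0) for the node to stay delegable.

import torch

x = torch.randn(1, 8, 4, 4)            # 4-D input: rank check passes
values, indices = torch.max(x, dim=1)  # exported as max.dim plus getitem nodes
y = values + 1                         # using only `values` keeps the node delegable;
                                       # consuming `indices` would trigger why(...)
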
+ +from typing import List, Optional + +import torch +from executorch.backends.xnnpack.partition.config.xnnpack_config import ( + ConfigPrecisionType, + XNNPartitionerConfig, +) +from torch.export import ExportedProgram + + +class QDQAffineConfigs(XNNPartitionerConfig): + def check_constraints(self, node: torch.fx.Node, ep: ExportedProgram) -> bool: + return True + + def get_node_and_deps( + self, node: torch.fx.Node, ep: ExportedProgram + ) -> List[torch.fx.Node]: + # Do not return anything from this because we only use this to + # preserve the decomposition + return [] + + def supported_precision_types(self) -> List[ConfigPrecisionType]: + return [ConfigPrecisionType.DYNAMIC_QUANT] + + +class QuantizeAffineConfig(QDQAffineConfigs): + target_name = "quantize_affine.default" + + def get_original_aten(self) -> Optional[torch._ops.OpOverload]: + try: + import torchao.quantization.quant_primitives # noqa + + return torch.ops.quant.quantize_affine.default + except: + return None + + +class DeQuantizeAffineConfig(QDQAffineConfigs): + target_name = "dequantize_affine.default" + + def get_original_aten(self) -> Optional[torch._ops.OpOverload]: + try: + import torchao.quantization.quant_primitives # noqa + + return torch.ops.quant.dequantize_affine.default + except: + return None + + +class ChooseQParamsAffineConfig(QDQAffineConfigs): + target_name = "choose_qparams_affine.default" + + def get_original_aten(self) -> Optional[torch._ops.OpOverload]: + try: + import torchao.quantization.quant_primitives # noqa + + return torch.ops.quant.choose_qparams_affine.default + except: + return None diff --git a/backends/xnnpack/partition/config/xnnpack_config.py b/backends/xnnpack/partition/config/xnnpack_config.py index 840ffbd43b4..d261416a76f 100644 --- a/backends/xnnpack/partition/config/xnnpack_config.py +++ b/backends/xnnpack/partition/config/xnnpack_config.py @@ -4,6 +4,7 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +import logging from abc import abstractmethod from enum import Enum from typing import List, Optional @@ -13,8 +14,12 @@ format_target_name, PartitionerConfig, ) +from executorch.exir.backend.utils import WhyNoPartition from torch.export import ExportedProgram +logger = logging.getLogger(__name__) +why = WhyNoPartition(logger=logger) + class ConfigPrecisionType(Enum): FP32 = 1 @@ -22,7 +27,6 @@ class ConfigPrecisionType(Enum): DYNAMIC_QUANT = 3 -# TODO: add WhyNotPartition to XNNPartitionerConfig class XNNPartitionerConfig(PartitionerConfig): """ Base partitioner config for XNNPACK Partitioner Configs. 
Base wrapper class @@ -33,9 +37,11 @@ class XNNPartitionerConfig(PartitionerConfig): types they want to enable """ - def __init__(self): + def __init__(self, **kwargs): super().__init__() self.enabled_precision_types = self.supported_precision_types() + # Flag used in GEMMConfig() + self.force_fp32_dynamic_linear = kwargs.get("force_fp32_dynamic_linear", False) def get_partition( self, node: torch.fx.Node, ep: ExportedProgram @@ -125,10 +131,12 @@ def check_common_constraints( ) if len(self.enabled_precision_types) == 0: + why(node, reason="not enabled precision types") return False has_valid_dtypes = self._check_node_has_valid_dtype(node) if not has_valid_dtypes: + why(node, reason="invalid dtype") return False return True diff --git a/backends/xnnpack/partition/xnnpack_partitioner.py b/backends/xnnpack/partition/xnnpack_partitioner.py index f582ea753f4..700c7d1b753 100644 --- a/backends/xnnpack/partition/xnnpack_partitioner.py +++ b/backends/xnnpack/partition/xnnpack_partitioner.py @@ -5,6 +5,8 @@ # LICENSE file in the root directory of this source tree. import itertools + +import logging from typing import List, Optional, Type, Union from executorch.backends.xnnpack.partition.config import ALL_PARTITIONER_CONFIGS @@ -21,6 +23,9 @@ from executorch.exir.backend.partitioner import DelegationSpec from torch.fx.passes.infra.partitioner import Partition +logging.basicConfig(level=logging.WARNING) +logger = logging.getLogger(__name__) + class XnnpackPartitioner(ConfigerationBasedPartitioner): def __init__( @@ -30,7 +35,17 @@ def __init__( Union[ConfigPrecisionType, List[ConfigPrecisionType]] ] = None, per_op_mode=False, + verbose: bool = False, + **kwargs, ): + """ + @verbose: if True, print out more information about the partitioner. + Default level is WARNING. If verbose is True, level is set to DEBUG. 
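+        @kwargs: forwarded to every enabled partitioner config via
+            config(**kwargs); e.g. force_fp32_dynamic_linear=True is read by
+            GEMMConfig to keep fp32 linear weights as graph inputs rather
+            than delegate-owned constants.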
+ """ + if verbose: + logger.setLevel(logging.DEBUG) + logger.debug("Verbose logging enabled for XNNPACK partitioner.") + delegation_spec = DelegationSpec(XnnpackBackend.__name__, []) configs_to_use = configs or ALL_PARTITIONER_CONFIGS # Can do logic and have extra args to filter/delete/select @@ -41,7 +56,7 @@ def __init__( for config in configs_to_use: # Config Classes given to XnnpackPartitioner should no longer be abstract - initialized = config() # pyre-ignore + initialized = config(**kwargs) # pyre-ignore initialized.set_enabled_precision_types(config_precisions) initialized_configs.append(initialized) diff --git a/backends/xnnpack/passes/TARGETS b/backends/xnnpack/passes/TARGETS index e91614c735b..6bc3742abe6 100644 --- a/backends/xnnpack/passes/TARGETS +++ b/backends/xnnpack/passes/TARGETS @@ -30,6 +30,7 @@ python_library( "//executorch/exir:pass_base", "//executorch/exir/dialects:lib", "//executorch/exir/passes:const_prop_pass", + "//executorch/exir/passes:memory_format_ops_pass", "//executorch/exir/program:program", ], ) diff --git a/backends/xnnpack/passes/channels_last_tagged_reshape_pass.py b/backends/xnnpack/passes/channels_last_tagged_reshape_pass.py index f1f9a69acca..692f1a9d145 100644 --- a/backends/xnnpack/passes/channels_last_tagged_reshape_pass.py +++ b/backends/xnnpack/passes/channels_last_tagged_reshape_pass.py @@ -124,7 +124,7 @@ def create_call_function_node( "call_function", target=target, args=args, - kwargs=( + kwargs=( # pyre-fixme[6] {"memory_format": memory_format} if memory_format is not None else {} ), ) diff --git a/backends/xnnpack/passes/convert_to_linear.py b/backends/xnnpack/passes/convert_to_linear.py index 69f882523c8..2cef71bf927 100644 --- a/backends/xnnpack/passes/convert_to_linear.py +++ b/backends/xnnpack/passes/convert_to_linear.py @@ -13,9 +13,8 @@ from executorch.backends.transforms.addmm_mm_to_linear import ( apply_addmm_mm_to_linear_transform, ) -from executorch.backends.xnnpack.passes.xnnpack_pass import XNNPACKPass -from executorch.backends.xnnpack.utils.utils import is_param_node from executorch.exir.dialects._ops import ops as exir_ops +from executorch.exir.pass_base import ExportPass from torch.fx.passes.infra.pass_base import PassResult from torch.fx.passes.utils.source_matcher_utils import ( @@ -27,7 +26,7 @@ logger.setLevel(logging.WARNING) -class ConvertToLinearPass(XNNPACKPass): +class ConvertToLinearPass(ExportPass): linear_modules = [ torch.nn.Linear, torch.nn.functional.linear, @@ -71,28 +70,24 @@ def get_arg(node: torch.fx.Node, arg: str): map_ = {"input": 0, "weight": 1} return None if arg == "bias" else node.args[map_[arg]] - def find_bias_for_mm(self, src_partition: SourcePartition, weight: torch.fx.Node): + def find_bias_for_mm(self, src_partition: SourcePartition, mm_node: torch.fx.Node): """ For linear decomposed with mm + add, find bias in src partition """ - out_channels = get_shape(weight)[0] - bias = None - - # Try to find bias node in all nodes - for node in src_partition.nodes: - if is_param_node(self.exported_program, node) and node != weight: - bias = node - - if bias is not None: - assert get_shape(bias) == [ - out_channels - ], f"Expected bias shape {[out_channels]} but got {get_shape(bias)}" - else: - assert exir_ops.edge.aten.add.Tensor not in [ - node.target for node in src_partition.nodes - ], f"Expecting to find bias for Linear module: {src_partition} but could not find it" - return bias + mm_users = list(mm_node.users.keys()) + if len(mm_users) != 1: + return None + + add_node = mm_users[0] + if 
add_node.target != exir_ops.edge.aten.add.Tensor: + return None + + for arg in add_node.all_input_nodes: + if arg != mm_node and arg in src_partition.input_nodes: + return arg + + return None def create_linear( self, @@ -119,7 +114,7 @@ def create_linear( src_partition.input_nodes + src_partition.params, # bias can be in params ) if linear_bias is None and node.target == exir_ops.edge.aten.mm.default: - linear_bias = self.find_bias_for_mm(src_partition, linear_weight) + linear_bias = self.find_bias_for_mm(src_partition, node) logger.debug(f"Found bias(?): {linear_bias} from node {node}") diff --git a/backends/xnnpack/passes/convert_to_sdpa.py b/backends/xnnpack/passes/convert_to_sdpa.py index 76bb24cc949..97aca5491dd 100644 --- a/backends/xnnpack/passes/convert_to_sdpa.py +++ b/backends/xnnpack/passes/convert_to_sdpa.py @@ -83,7 +83,7 @@ def create_sdpa( kwargs={"scale": scale}, ) - sdpa_node.meta["val"] = sdpa_node.target( + sdpa_node.meta["val"] = sdpa_node.target( # pyre-fixme[29] *[n.meta["val"] for n in match.placeholder_nodes], scale=scale, ) diff --git a/backends/xnnpack/passes/tag_implicit_q_dq_pass.py b/backends/xnnpack/passes/tag_implicit_q_dq_pass.py index 0aa2e1291e3..ac6ccc9b89d 100644 --- a/backends/xnnpack/passes/tag_implicit_q_dq_pass.py +++ b/backends/xnnpack/passes/tag_implicit_q_dq_pass.py @@ -12,7 +12,11 @@ SUPPORTED_IMPLICIT_Q_DQ_OP_NAMES_SET, ) from executorch.backends.xnnpack.passes.xnnpack_pass import XNNPACKPass -from executorch.backends.xnnpack.utils.quant_utils import is_dequant, is_quant +from executorch.backends.xnnpack.utils.quant_utils import ( + is_dequant, + is_dynamic_qdq, + is_quant, +) from executorch.backends.xnnpack.utils.utils import is_param_node from executorch.exir.dialects._ops import ops as exir_ops from executorch.exir.pass_base import PassResult @@ -76,18 +80,7 @@ def is_output_node(self, node: torch.fx.Node) -> bool: return node.op == "output" def is_dynamically_quantized(self, node: torch.fx.Node) -> bool: - return any( - is_dequant(input_node) - and ( - cast( - torch._ops.OpOverload, input_node.target - )._schema.schema.overload_name - == "tensor" - or input_node.target - == exir_ops.edge.quantized_decomposed.dequantize_per_token.default - ) - for input_node in node.all_input_nodes - ) + return is_dynamic_qdq(node) def is_supported_quant_op(self, node: torch.fx.Node) -> bool: return ( @@ -191,7 +184,7 @@ def call(self, graph_module: torch.fx.GraphModule) -> PassResult: ending_implicit_q_nodes = [] for user in first_node.users: - if self.is_dynamically_quantized(user): + if self.is_dynamically_quantized(first_node): # if the dq is a dynamic dq, then it is implicit break user_end_nodes = self.get_ending_implicit_q_nodes(user) diff --git a/backends/xnnpack/runtime/XNNCompiler.cpp b/backends/xnnpack/runtime/XNNCompiler.cpp index ac53831b04c..2145ea15199 100644 --- a/backends/xnnpack/runtime/XNNCompiler.cpp +++ b/backends/xnnpack/runtime/XNNCompiler.cpp @@ -9,7 +9,7 @@ #include #include #include -#include +#include #include #include @@ -21,6 +21,25 @@ namespace executor { namespace xnnpack { namespace delegate { +/* + * Provide compile-time allocation. + */ +class CompileAllocator { + public: + /* + * Allocate memory which will be automatically freed at the end + * of the compilation process. 
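+   * Buffers are owned by temporaries_ and live until this allocator is
+   * destroyed (i.e. until XNNCompiler::compileModel returns); this is used
+   * below to keep fp32->bf16 converted scale buffers alive while the
+   * XNNPACK subgraph is being defined.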
+ */ + void* allocateTemporary(size_t size) { + auto mem = new uint8_t[size]; + temporaries_.emplace_back(mem); + return mem; + } + + private: + std::vector> temporaries_; +}; + // Flatbuffer types using ValuePtr = const fb_xnnpack::XValue*; using NodePtr = const fb_xnnpack::XNode*; @@ -35,6 +54,23 @@ using DefineNodeFunc = Error (*)( const std::unordered_map&, NodePtr) noexcept; +/* +Convert a tensor from fp32 to bf16. +*/ +void convertF32TensorToBF16( + const float* f32_data, + uint16_t* bf16_data_out, + size_t numel) { + for (auto i = 0u; i < numel; i++) { + // Adjust the f32 value such that it rounds properly after truncation. + // Constant factor scales 1+2^-8 to 1+2e-7. + float f32_adjusted = f32_data[i] * 1.00389105f; + uint32_t f32_bits; + memcpy(&f32_bits, &f32_adjusted, sizeof(float)); + bf16_data_out[i] = static_cast(f32_bits >> 16); + } +} + /* Gets the output min and output max for a given node operator */ @@ -152,7 +188,8 @@ Error defineTensor( GraphPtr flatbuffer_graph, const uint8_t* constant_data_ptr, std::vector& input_ids, - std::vector& output_ids) { + std::vector& output_ids, + CompileAllocator& allocator) { const fb_xnnpack::XNNTensorValue* tensor_value = nullptr; const fb_xnnpack::XNNQuantizedTensorValue* qtensor_value = nullptr; @@ -356,12 +393,31 @@ Error defineTensor( size_t group_size = qparams->group_size(); size_t output_channels = tensor_value->dims()->Get(0); size_t input_channels = tensor_value->dims()->Get(1); + + const uint16_t* scale_data = nullptr; + uint32_t scale_numel = 0; + + // Block scales are preferably serialized as bf16 but can also be + // serialized as fp32 for backwards compatability. + if (qparams->scale_bf16() != nullptr) { + scale_data = + static_cast(qparams->scale_bf16()->data()); + scale_numel = qparams->scale_bf16()->size(); + } else { + // Read fp32 scales, convert to bf16. 
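+        // The conversion buffer is allocated through the CompileAllocator so
+        // it stays alive for xnn_define_blockwise_quantized_tensor_value
+        // below. convertF32TensorToBF16 scales each value by ~1.0039 (about
+        // 1 + 2^-8, half a bf16 mantissa step) before taking the upper 16
+        // bits, approximating round-to-nearest instead of simple truncation.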
+ auto conv_buffer = static_cast(allocator.allocateTemporary( + qparams->scale()->size() * sizeof(uint16_t))); + scale_numel = qparams->scale()->size(); + convertF32TensorToBF16( + qparams->scale()->data(), conv_buffer, scale_numel); + scale_data = conv_buffer; + } + ET_CHECK_OR_RETURN_ERROR( - qparams->scale()->size() == - output_channels * input_channels / group_size, + scale_numel == output_channels * input_channels / group_size, Internal, "scale size %zu != output channels %zu * group size %zu", - (size_t)qparams->scale()->size(), + static_cast(scale_numel), output_channels, group_size); int32_t zero_point = @@ -370,18 +426,19 @@ Error defineTensor( Debug, "define quant tensor (per channel group): buffer_ptr: %p, scale.numel(): %u, channel_dim: %u, grpup_size: %zu, output_channels: %zu, dtype: %u, zero_point: %d, datatype: %d\n", buffer_ptr, - qparams->scale()->size(), + scale_numel, qparams->channel_dim(), group_size, output_channels, datatype, zero_point, datatype); + status = xnn_define_blockwise_quantized_tensor_value( /*subgraph=*/subgraph_ptr, /*datatype=*/datatype, /*zero_point=*/zero_point, - /*scale=*/qparams->scale()->data(), + /*scale=*/scale_data, /*num_dims=*/tensor_value->num_dims(), /*channel_dim=*/qparams->channel_dim(), /*block_size=*/qparams->group_size(), @@ -1617,6 +1674,7 @@ ET_NODISCARD Error XNNCompiler::compileModel( Result header = XNNHeader::Parse(buffer_pointer, num_bytes); const uint8_t* flatbuffer_data = nullptr; const uint8_t* constant_data = nullptr; + CompileAllocator compile_allocator; // Header status can only either be Error::Ok or Error::NotFound if (header.ok()) { @@ -1688,7 +1746,8 @@ ET_NODISCARD Error XNNCompiler::compileModel( flatbuffer_graph, constant_data, input_ids, - output_ids); + output_ids, + compile_allocator); if (err != Error::Ok) { return err; diff --git a/backends/xnnpack/runtime/XNNPACKBackend.cpp b/backends/xnnpack/runtime/XNNPACKBackend.cpp index 264dc838720..c817c010e29 100644 --- a/backends/xnnpack/runtime/XNNPACKBackend.cpp +++ b/backends/xnnpack/runtime/XNNPACKBackend.cpp @@ -20,7 +20,7 @@ namespace torch { namespace executor { -class XnnpackBackend final : public PyTorchBackendInterface { +class XnnpackBackend final : public ::executorch::runtime::BackendInterface { public: ~XnnpackBackend() = default; diff --git a/backends/xnnpack/serialization/runtime_schema.fbs b/backends/xnnpack/serialization/runtime_schema.fbs index f32e7c60637..efe717e085e 100644 --- a/backends/xnnpack/serialization/runtime_schema.fbs +++ b/backends/xnnpack/serialization/runtime_schema.fbs @@ -63,6 +63,7 @@ table PerChannelGroupQuant { scale:[float]; channel_dim:int; group_size:int; + scale_bf16:[ushort]; } table XNNTensorValue { diff --git a/backends/xnnpack/serialization/schema.fbs b/backends/xnnpack/serialization/schema.fbs index 773a459bbf6..33571195d63 100644 --- a/backends/xnnpack/serialization/schema.fbs +++ b/backends/xnnpack/serialization/schema.fbs @@ -48,6 +48,7 @@ table PerChannelGroupQuant { scale:[float]; channel_dim:int; group_size:int; + scale_bf16:[ushort]; } table PerChannelQuant { diff --git a/backends/xnnpack/targets.bzl b/backends/xnnpack/targets.bzl index 4fd0ee519cb..633808dcfe5 100644 --- a/backends/xnnpack/targets.bzl +++ b/backends/xnnpack/targets.bzl @@ -36,10 +36,10 @@ def define_common_targets(): "@EXECUTORCH_CLIENTS", ], preprocessor_flags = [ + # Enable workspace sharing across delegates + "-DENABLE_XNNPACK_SHARED_WORKSPACE", # Uncomment to enable per operator timings # "-DENABLE_XNNPACK_PROFILING", - # Uncomment to 
enable workspace sharing across delegates - # "-DENABLE_XNNPACK_SHARED_WORKSPACE" ], exported_deps = [ "//executorch/runtime/backend:interface", @@ -47,7 +47,7 @@ def define_common_targets(): deps = [ third_party_dep("XNNPACK"), "//executorch/backends/xnnpack/serialization:xnnpack_flatbuffer_header", - "//executorch/backends/xnnpack/threadpool:threadpool", + "//executorch/extension/threadpool:threadpool", "//executorch/runtime/core/exec_aten/util:tensor_util", ], # XnnpackBackend.cpp needs to compile with executor as whole diff --git a/backends/xnnpack/test/CMakeLists.txt b/backends/xnnpack/test/CMakeLists.txt index d0fbddae237..02852871fe0 100644 --- a/backends/xnnpack/test/CMakeLists.txt +++ b/backends/xnnpack/test/CMakeLists.txt @@ -23,8 +23,10 @@ include(${EXECUTORCH_ROOT}/build/Test.cmake) set(_test_srcs # We can't put runtime/test_runtime_utils.cpp because we don't # build aten - runtime/test_xnnexecutor.cpp ../threadpool/threadpool.cpp - ../threadpool/threadpool_guard.cpp ../threadpool/test/threadpool_test.cpp + runtime/test_xnnexecutor.cpp + ${EXECUTORCH_ROOT}/extension/threadpool/threadpool.cpp + ${EXECUTORCH_ROOT}/extension/threadpool/threadpool_guard.cpp + ${EXECUTORCH_ROOT}/extension/threadpool/test/threadpool_test.cpp ) et_cxx_test( @@ -32,6 +34,7 @@ et_cxx_test( SOURCES ${_test_srcs} EXTRA_LIBS + extension_threadpool xnnpack_backend XNNPACK pthreadpool diff --git a/backends/xnnpack/test/TARGETS b/backends/xnnpack/test/TARGETS index abedffb8e61..629ac8275bc 100644 --- a/backends/xnnpack/test/TARGETS +++ b/backends/xnnpack/test/TARGETS @@ -36,10 +36,10 @@ runtime.python_test( deps = [ "//executorch/backends/xnnpack/partition:xnnpack_partitioner", "//executorch/backends/xnnpack/test/tester:tester", + "//executorch/devtools:lib", + "//executorch/devtools/bundled_program:config", + "//executorch/devtools/bundled_program/serialize:lib", "//executorch/exir/passes:constant_prop_pass", - "//executorch/sdk:lib", - "//executorch/sdk/bundled_program:config", - "//executorch/sdk/bundled_program/serialize:lib", "//pytorch/ao:torchao", # @manual ], external_deps = [ diff --git a/backends/xnnpack/test/ops/linear.py b/backends/xnnpack/test/ops/linear.py index a9459050e79..d8de79f283d 100644 --- a/backends/xnnpack/test/ops/linear.py +++ b/backends/xnnpack/test/ops/linear.py @@ -26,8 +26,167 @@ ) from torch.ao.quantization.quantizer.xnnpack_quantizer_utils import QuantizationConfig +try: + from torchao.quantization.quant_api import ( + int8_dynamic_activation_int4_weight, + quantize_, + unwrap_tensor_subclass, + ) + + torchao_installed = True +except: + torchao_installed = False + + +# Pytorch Modules Used for Testing +class BaseLinear(torch.nn.Module): + def __init__( + self, + in_size: int = 2, + input_channels: int = 4, + output_channels: int = 4, + dtype: torch.dtype = torch.float, + use_bias: bool = False, + ): + super().__init__() + self.linear = torch.nn.Linear( + input_channels, output_channels, bias=use_bias + ).to(dtype=dtype) + + self.ic = input_channels + self.oc = output_channels + + assert dtype in [torch.float, torch.half], "Unsupported op dtype" + self.op_dtype = dtype + self.in_size = in_size + + def forward(self, x): + return self.linear(x) + + def get_inputs(self): + return (torch.randn(1, self.in_size, self.ic).to(self.op_dtype),) + + +class AddMMModule(torch.nn.Module): + def __init__(self, in_size, out_size): + super().__init__() + self.mat = torch.nn.Parameter(torch.randn(in_size, out_size)) + self.bias = torch.nn.Parameter(torch.randn(1, out_size)) + + def forward(self, 
x): + return torch.addmm(self.bias, x, self.mat) + + +class LinearReluModule(torch.nn.Module): + def __init__(self, in_size, out_size, use_bias, dtype=torch.float): + super().__init__() + self.dtype = dtype + self.linear = torch.nn.Linear(in_size, out_size, bias=use_bias).to(dtype=dtype) + + def forward(self, x): + return torch.nn.functional.relu(self.linear(x)) + + def get_inputs(self): + return (torch.randn(1, self.in_size, self.ic).to(self.op_dtype),) + + +class LinearParallelSequentialModule(torch.nn.Module): + def __init__( + self, + in_size=2, + input_size=4, + intermediate_size=5, + output_size=3, + dtype=torch.float, + ): + super().__init__() + self.linear1_weight = torch.nn.Parameter( + torch.rand(intermediate_size, input_size) + ) + self.linear1_bias = torch.nn.Parameter(torch.rand(intermediate_size)) + + self.linear2_weight = torch.nn.Parameter( + torch.rand(intermediate_size, input_size) + ) + self.linear2_bias = torch.nn.Parameter(torch.rand(intermediate_size)) + + self.linear3_weight = torch.nn.Parameter( + torch.rand(output_size, intermediate_size) + ) + self.linear3_bias = torch.nn.Parameter(torch.rand(output_size)) + self.in_size = in_size + self.input_size = input_size + self.dtype = torch.float + + def forward(self, x, y): + a = torch.nn.functional.linear(x, self.linear1_weight, self.linear1_bias) + b = torch.nn.functional.linear(y, self.linear2_weight, self.linear2_bias) + c = torch.nn.functional.linear(b, self.linear3_weight, self.linear3_bias) + return (a, c) + + def get_inputs(self): + return ( + torch.rand(self.in_size, self.input_size, dtype=self.dtype), + torch.rand(self.in_size, self.input_size, dtype=self.dtype), + ) + + +class LinearSequential(torch.nn.Module): + def __init__( + self, + in_size=2, + input_size=4, + intermediate_size=5, + output_size=3, + dtype=torch.float, + ): + super().__init__() + self.linear1_weight = torch.nn.Parameter( + torch.rand(intermediate_size, input_size) + ) + self.linear1_bias = torch.nn.Parameter(torch.rand(intermediate_size)) + + self.linear2_weight = torch.nn.Parameter( + torch.rand(output_size, intermediate_size) + ) + self.linear2_bias = torch.nn.Parameter(torch.rand(output_size)) + self.in_size = in_size + self.input_size = input_size + self.dtype = torch.float + + def forward(self, x): + a = torch.nn.functional.linear(x, self.linear1_weight, self.linear1_bias) + b = torch.nn.functional.linear(a, self.linear2_weight, self.linear2_bias) + return b + + def get_inputs(self): + return (torch.rand(self.in_size, self.input_size, dtype=torch.float),) + class TestLinear(unittest.TestCase): + """ + Test Class for XNNPACK Linear Operators. + + Notes: + - XNNPACK Does not support Per Tensor Quantized Weights with Dynamic Activations + - XNNPACK Only supports Per-Token Activation, so Dynamic per-tensor Quantization + As done by the default dynamic quantization flow does Per-Token Quantization + Activation under the hood, where the torch.nn.Module is doing Per-Tensor Quantization + on the Activation. This is sufficient because Per-Token Quantization on Activations + should produce strictly better results compared to Per-Tensor Quantization + """ + + @staticmethod + def _get_4b_dqconfig() -> QuantizationConfig: + # Returns a QuantizationConfig for 4b dynamic quantization for XNNPACK. 
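+        # weight_qmin/weight_qmax of -8/7 give the signed 4-bit weight range;
+        # is_per_channel + is_dynamic select per-channel weights with
+        # dynamically quantized activations (per-token under the hood, per
+        # the class docstring above).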
+ qconfig: QuantizationConfig = get_symmetric_quantization_config( + is_per_channel=True, + is_dynamic=True, + weight_qmin=-8, + weight_qmax=7, + ) + return qconfig + def test_fp16_linear(self): for use_bias in (True, False): for num_batch_dims in range(1, 3): @@ -65,33 +224,13 @@ def test_qc8_linear(self): ) def test_fp32_addmm(self): - """ - Note that the ConvertToLinear pass requires the weight matrix to be transposed. - """ - - class AddMMModule(torch.nn.Module): - def __init__(self, in_size, out_size): - super().__init__() - self.mat = torch.nn.Parameter(torch.randn(in_size, out_size)) - self.bias = torch.nn.Parameter(torch.randn(1, out_size)) - - def forward(self, x): - return torch.addmm(self.bias, x, self.mat) - + # Note that the ConvertToLinear pass requires the weight matrix to be transposed. self._test_linear( lambda in_size, out_size: AddMMModule(in_size, out_size), uses_bias=True, ) def test_fp32_linear_fused_relu(self): - class LinearReluModule(torch.nn.Module): - def __init__(self, in_size, out_size, use_bias): - super().__init__() - self.linear = torch.nn.Linear(in_size, out_size, bias=use_bias) - - def forward(self, x): - return torch.nn.functional.relu(self.linear(x)) - for use_bias in (True, False): for num_batch_dims in range(1, 3): self._test_linear( @@ -105,14 +244,6 @@ def forward(self, x): ) def test_qs8_linear_fused_relu(self): - class LinearReluModule(torch.nn.Module): - def __init__(self, in_size, out_size, use_bias): - super().__init__() - self.linear = torch.nn.Linear(in_size, out_size, bias=use_bias) - - def forward(self, x): - return torch.nn.functional.relu(self.linear(x)) - for use_bias in (True, False): for num_batch_dims in range(1, 3): self._test_linear( @@ -138,21 +269,6 @@ def test_qs8_linear(self): quant_type="per_tensor", ) - @unittest.skip("XNNPACK currently only supports per-channel dynamic quantization.") - def _test_qd8_per_tensor_linear(self): - for uses_bias in (False, True): - inputs = (torch.randn(2, 4),) - module = torch.nn.Linear(4, 5, bias=uses_bias) - dynamic_shapes = ({0: torch.export.Dim("batch", max=100)},) - - self._test_dqlinear( - module, - inputs, - dynamic_shapes=dynamic_shapes, - is_per_channel=False, - uses_bias=uses_bias, - ) - def test_qd8_per_channel_linear(self): for uses_bias in (False, True): inputs = (torch.randn(2, 4),) @@ -166,19 +282,6 @@ def test_qd8_per_channel_linear(self): uses_bias=uses_bias, ) - @staticmethod - def _get_4b_dqconfig() -> QuantizationConfig: - """ - Returns a QuantizationConfig for 4b dynamic quantization for XNNPACK. 
- """ - qconfig: QuantizationConfig = get_symmetric_quantization_config( - is_per_channel=True, - is_dynamic=True, - weight_qmin=-8, - weight_qmax=7, - ) - return qconfig - def test_qd8_per_channel_4w_linear(self): qconfig = self._get_4b_dqconfig() input_channels = [2, 63] @@ -267,38 +370,12 @@ def test_qd8_per_channel_linear_with_two_batch(self): ) def test_qd8_per_channel_linear_sequential(self): - in_size = 2 - input_size = 4 - intermediate_size = 5 - output_size = 3 - - class LinearSequential(torch.nn.Module): - def __init__(self): - super().__init__() - self.linear1_weight = torch.nn.Parameter( - torch.rand(intermediate_size, input_size) - ) - self.linear1_bias = torch.nn.Parameter(torch.rand(intermediate_size)) - - self.linear2_weight = torch.nn.Parameter( - torch.rand(output_size, intermediate_size) - ) - self.linear2_bias = torch.nn.Parameter(torch.rand(output_size)) - - def forward(self, x): - a = torch.nn.functional.linear( - x, self.linear1_weight, self.linear1_bias - ) - b = torch.nn.functional.linear( - a, self.linear2_weight, self.linear2_bias - ) - return b - - inputs = (torch.rand(in_size, input_size, dtype=torch.float),) + lin_mod = LinearSequential() + inputs = lin_mod.get_inputs() dynamic_shapes = ({0: torch.export.Dim("batch", max=100)},) self._test_dqlinear( - LinearSequential(), + lin_mod, inputs, dynamic_shapes=dynamic_shapes, linear_count=2, @@ -307,53 +384,16 @@ def forward(self, x): atol=1e-1, ) - def test_qd8_per_channel_linear_parellel_and_sequential(self): - in_size = 2 - input_size = 4 - intermediate_size = 5 - output_size = 3 - - class LinearModule(torch.nn.Module): - def __init__(self): - super().__init__() - self.linear1_weight = torch.nn.Parameter( - torch.rand(intermediate_size, input_size) - ) - self.linear1_bias = torch.nn.Parameter(torch.rand(intermediate_size)) - - self.linear2_weight = torch.nn.Parameter( - torch.rand(intermediate_size, input_size) - ) - self.linear2_bias = torch.nn.Parameter(torch.rand(intermediate_size)) - - self.linear3_weight = torch.nn.Parameter( - torch.rand(output_size, intermediate_size) - ) - self.linear3_bias = torch.nn.Parameter(torch.rand(output_size)) - - def forward(self, x, y): - a = torch.nn.functional.linear( - x, self.linear1_weight, self.linear1_bias - ) - b = torch.nn.functional.linear( - y, self.linear2_weight, self.linear2_bias - ) - c = torch.nn.functional.linear( - b, self.linear3_weight, self.linear3_bias - ) - return (a, c) - - inputs = ( - torch.rand(in_size, input_size, dtype=torch.float), - torch.rand(in_size, input_size, dtype=torch.float), - ) + def test_qd8_per_channel_linear_parallel_and_sequential(self): + lin_mod = LinearParallelSequentialModule() + inputs = lin_mod.get_inputs() dynamic_shapes = ( {0: torch.export.Dim("batch", max=100)}, {0: torch.export.Dim("batch2", max=100)}, ) self._test_dqlinear( - LinearModule(), + lin_mod, inputs, dynamic_shapes=dynamic_shapes, linear_count=3, @@ -362,90 +402,59 @@ def forward(self, x, y): atol=1e-1, ) - def test_qd8_fp32_per_token_weight_per_channel_int8(self): - self._run_manual_dqlinear_tests(8, torch.float) - - def test_qd8_fp32_per_token_weight_per_channel_int4(self): - self._run_manual_dqlinear_tests(4, torch.float) - - # This fails because the output tensor dtype is different, but if you squint and ignore that and look at the values, - # it is not too bad. - # Difference: max: 0.042601585388183594, abs: 0.042601585388183594. - # -- Model vs. 
Reference -- - # Numel: 68, 68 - # Median: -0.7754800915718079, -0.7755751013755798 - # Mean: -0.6128872036933899, -0.6143574714660645 - # Max: 12.518657684326172, 12.516003608703613 - # Min: -20.070953369140625, -20.077701568603516 - @unittest.skip("Need to fix the dq_per_channel output dtype") - def _test_qd8_fp16_per_token_weight_per_channel_int8(self): - self._run_manual_dqlinear_tests(8, torch.float16) - - @unittest.skip("Need to fix the dq_per_channel output dtype") - def _test_qd8_fp16_per_token_weight_per_channel_int4(self): - self._run_manual_dqlinear_tests(4, torch.float16) - + @unittest.skipIf( + not torchao_installed, "Per Channel Group Quantization Required TorchAO" + ) def test_qd8_fp32_per_token_weight_per_channel_group_int4(self): M_sizes = [1, 2, 17, 31] - K_sizes = [8, 32, 64, 128] - bl_sizes = [8, 16, 16, 32] + K_sizes = [32, 32, 64, 128] + bl_sizes = [32, 32, 32, 64] N_sizes = [2, 17, 92, 128] for use_bias in [True, False]: - for i, _ in enumerate(M_sizes): - M = int(M_sizes[i]) - K = int(K_sizes[i]) - N = int(N_sizes[i]) - bl = int(bl_sizes[i]) - mod = self.ManualDQLinear( + for M, K, bl, N in zip(M_sizes, K_sizes, bl_sizes, N_sizes): + lin_mod = BaseLinear( input_channels=K, output_channels=N, - weight_n_bit=4, dtype=torch.float, - group_size=bl, - force_groupwise_quant=True, use_bias=use_bias, ) inputs = (torch.randn(1, M, K),) - self._test_manual_dq_linear( - mod, - inputs, - weight_groupwise=True, - use_bias=use_bias, + self._test_groupwise_dq_linear( + lin_mod, inputs, group_size=bl, use_bias=use_bias ) - @unittest.skip("Need to fix the dq_per_channel_group output dtype") - def _test_qd8_fp16_per_token_weight_per_channel_group_int4(self): + @unittest.skipIf( + not torchao_installed, "Per Channel Group Quantization Required TorchAO" + ) + def test_qd8_fp16_per_token_weight_per_channel_group_int4(self): M_sizes = [1, 2, 17, 31] - K_sizes = [8, 32, 64, 128] - bl_sizes = [8, 16, 16, 32] + K_sizes = [32, 32, 64, 128] + bl_sizes = [32, 32, 32, 64] N_sizes = [2, 17, 92, 128] for use_bias in [True, False]: - for i, _ in enumerate(M_sizes): - M = int(M_sizes[i]) - K = int(K_sizes[i]) - N = int(N_sizes[i]) - bl = int(bl_sizes[i]) - mod = self.ManualDQLinear( + for M, K, bl, N in zip(M_sizes, K_sizes, bl_sizes, N_sizes): + lin_mod = BaseLinear( + in_size=M, input_channels=K, output_channels=N, - weight_n_bit=4, dtype=torch.float16, - group_size=bl, - force_groupwise_quant=True, use_bias=use_bias, ) - inputs = (torch.randn(1, M, K, dtype=torch.float16),) - self._test_manual_dq_linear( - mod, - inputs, - weight_groupwise=True, - use_bias=use_bias, - atol=0.1, - rtol=0.1, + inputs = lin_mod.get_inputs() + # This requires slightly higher atol, but if you look at error it is not that bad: + # Difference: max: 0.00140380859375, abs: 0.00140380859375, mean abs error: 0.00042724609375. + # -- Model vs. 
Reference -- + # Numel: 4, 4 + # Median: -0.05023193359375, -0.0516357421875 + # Mean: 0.2373046875, 0.237060546875 + # Max: 1.0078125, 1.0078125 + # Min: -0.08465576171875, -0.08441162109375 + self._test_groupwise_dq_linear( + lin_mod, inputs, group_size=bl, use_bias=use_bias, atol=1e-2 ) def _test_linear( @@ -467,7 +476,20 @@ def _test_linear( input_sizes = [4, 37, 17] output_sizes = [4, 17, 37] - quant = quant_type is not None + quant_config = None + if quant_type is not None: + if quant_type == "per_channel": + quant_config = get_symmetric_quantization_config( + is_per_channel=True, + is_dynamic=False, + ) + elif quant_type == "per_tensor": + quant_config = get_symmetric_quantization_config( + is_per_channel=False, + is_dynamic=False, + ) + else: + raise ValueError(f"Unsupported quant type {quant_type}") """ Note that torch.nn.Linear maps to aten.mm.default (no bias) or aten.addmm.default (bias), @@ -478,7 +500,6 @@ def _test_linear( input_size = int(input_sizes[i]) output_size = int(output_sizes[i]) input_shape = [in_size] * num_batch_dims + [input_size] - print(f"Testing input_shape {input_shape} with {output_size} out_channels") module = make_module(input_size, output_size).eval().to(dtype) inputs = (torch.randn(input_shape).to(dtype),) @@ -487,28 +508,15 @@ def _test_linear( dynamic_shape[i] = torch.export.Dim(f"batch{i}", min=2, max=in_size) dynamic_shape = (dynamic_shape,) - print(dynamic_shape) for legacy_mode in (True, False): tester = Tester(module, inputs, dynamic_shapes=dynamic_shape) - if quant: - if quant_type == "per_channel": - quant_config = get_symmetric_quantization_config( - is_per_channel=True, - is_dynamic=False, - ) - elif quant_type == "per_tensor": - quant_config = get_symmetric_quantization_config( - is_per_channel=False, - is_dynamic=False, - ) - else: - raise ValueError(f"Unsupported quant type {quant_type}") + if quant_config: tester.quantize(Quantize(quantization_config=quant_config)) tester.export() - if quant: + if quant_config: tester.check(["torch.ops.quantized_decomposed"]) if legacy_mode: @@ -522,12 +530,19 @@ def _test_linear( ) tester.check_not([edge_op]) - if quant: - tester.check_not([edge_op, "torch.ops.quantized_decomposed"]) + if quant_config: + tester.check_not( + [ + "executorch_exir_dialects_edge__ops_aten_mm_default", + "executorch_exir_dialects_edge__ops_aten_addmm_default", + ] + ) tester.to_executorch() tester.serialize() - tester.run_method_and_compare_outputs(qtol=quant, atol=atol) + tester.run_method_and_compare_outputs( + qtol=bool(quant_config), atol=atol + ) def _test_dqlinear( self, @@ -540,24 +555,19 @@ def _test_dqlinear( qconfig: Optional[QuantizationConfig] = None, atol=5e-02, ): - edge_op = ( - "executorch_exir_dialects_edge__ops_aten_addmm_default" - if uses_bias - else "executorch_exir_dialects_edge__ops_aten_mm_default" - ) - quant_config = qconfig or get_symmetric_quantization_config( is_per_channel=is_per_channel, is_dynamic=True, ) for legacy_partitioner in (True, False): for per_op_mode in (True, False): - tester = Tester(module, inputs, dynamic_shapes=dynamic_shapes) - tester.quantize(Quantize(quantization_config=quant_config)) DynamicallyQuantizedPartitioner = XnnpackPartitioner( config_precisions=ConfigPrecisionType.DYNAMIC_QUANT, per_op_mode=per_op_mode, ) + + tester = Tester(module, inputs, dynamic_shapes=dynamic_shapes) + tester.quantize(Quantize(quantization_config=quant_config)) tester.export() if legacy_partitioner: @@ -567,357 +577,74 @@ def _test_dqlinear( tester.to_edge_transform_and_lower( 
ToEdgeTransformAndLower([DynamicallyQuantizedPartitioner]) ) - num_call_delegates = linear_count if per_op_mode else 1 tester.check_count( { - "torch.ops.higher_order.executorch_call_delegate": num_call_delegates + "torch.ops.higher_order.executorch_call_delegate": ( + linear_count if per_op_mode else 1 + ) } ) - tester.check_not([edge_op]) + tester.check_not( + [ + "executorch_exir_dialects_edge__ops_aten_mm_default", + "executorch_exir_dialects_edge__ops_aten_addmm_default", + ] + ) tester.to_executorch() tester.serialize() tester.run_method_and_compare_outputs(atol=atol) - class ManualDQLinear(torch.nn.Module): - def __init__( - self, - input_channels: int = 4, - output_channels: int = 4, - dtype: torch.dtype = torch.float, - weight_n_bit: int = 4, - group_size: int = 0, - force_groupwise_quant: bool = False, - use_bias: bool = False, - ): - super().__init__() - - self.ic = input_channels - self.oc = output_channels - - assert dtype in [torch.float, torch.half], "Unsupported op dtype" - self.op_dtype = dtype - - self.group_size = self.ic if group_size == 0 else group_size - self.num_groups = 1 - if self.group_size != self.ic: - assert self.ic % self.group_size == 0 - assert self.group_size % 8 == 0 # TODO make this 16 - self.num_groups = self.ic // self.group_size - - assert weight_n_bit in [4, 8], "Unsupported weight_n_bit" - self.w_n_bit = weight_n_bit - self.w_quant_min, self.w_quant_max = self.get_min_max(self.w_n_bit) - - self.w = torch.nn.Parameter( - torch.randn(self.oc, self.ic), requires_grad=False - ) - self.w_q = torch.nn.Parameter( - torch.zeros(self.oc, self.ic), requires_grad=False - ) - # Quantize the weights as per folded setup - if self.group_size != self.ic or force_groupwise_quant: - self.w_scales = torch.nn.Parameter( - torch.zeros(self.oc, self.num_groups), requires_grad=False - ) - self.w_zero_points = torch.nn.Parameter( - torch.zeros(self.oc, self.num_groups), requires_grad=False - ) - self.quant_weight_per_channel_group() - else: # per_channel quantization - self.w_scales = torch.nn.Parameter( - torch.zeros(self.oc), requires_grad=False - ) - self.w_zero_points = torch.nn.Parameter( - torch.zeros(self.oc), requires_grad=False - ) - self.quant_weight_per_channel() - - self.bias = ( - torch.nn.Parameter( - torch.randn(self.oc).to(self.op_dtype), requires_grad=False - ) - if use_bias - else None - ) - - def get_min_max(self, n_bit: int = 4): - max_int = 2 ** (n_bit - 1) - 1 - min_int = -(2 ** (n_bit - 1)) - return min_int, max_int - - def get_channel_qparams_symmetric( - self, - w: torch.Tensor, - n_bit: int = 4, - precision: torch.dtype = torch.float32, - ): - assert w.dim() == 2 - - to_quant = w.to(precision) - assert torch.isnan(to_quant).sum() == 0 - - max_val = to_quant.amax(dim=1, keepdim=True) - min_val = to_quant.amin(dim=1, keepdim=True) - min_val_neg = torch.min(min_val, torch.zeros_like(min_val)) - max_val_pos = torch.max(max_val, torch.zeros_like(max_val)) - - min_int, max_int = self.get_min_max(n_bit) - - max_val_abs = torch.max(-min_val_neg, max_val_pos) - scales = max_val_abs / (float(max_int - min_int) / 2) - scales = torch.max( - scales, torch.full_like(scales, torch.finfo(torch.float32).eps) - ) - zeros = torch.full_like(scales, 0) - return scales.to(precision).reshape(w.shape[0]), zeros.to( - precision - ).reshape(w.shape[0]).reshape(w.shape[0]) - - # Note: not using from torchao.quantization.quant_primitives because it will run into op registraion issues - def get_group_qparams_symmetric( - self, w, n_bit=4, groupsize=128, 
precision=torch.float32 - ): - # needed for GPTQ with padding - if groupsize > w.shape[-1]: - groupsize = w.shape[-1] - assert groupsize > 1 - assert w.shape[-1] % groupsize == 0 - assert w.dim() == 2 - - to_quant = w.reshape(-1, groupsize) - assert torch.isnan(to_quant).sum() == 0 - - max_val = to_quant.amax(dim=1, keepdim=True) - min_val = to_quant.amin(dim=1, keepdim=True) - min_val_neg = torch.min(min_val, torch.zeros_like(min_val)) - max_val_pos = torch.max(max_val, torch.zeros_like(max_val)) - - max_val_abs = torch.max(-min_val_neg, max_val_pos) - max_int = 2 ** (n_bit - 1) - 1 - min_int = -(2 ** (n_bit - 1)) - - scales = max_val_abs / (float(max_int - min_int) / 2) - scales = torch.max( - scales, torch.full_like(scales, torch.finfo(torch.float32).eps) - ) - # TODO: make sure abs(scales) is not too small? - zeros = torch.full_like(scales, 0) - return scales.to(precision).reshape(w.shape[0], -1), zeros.to( - precision - ).reshape(w.shape[0], -1) - - # Note: not using from torchao.quantization.quant_primitives because it will run into op registraion issues - def group_quantize_tensor_symmetric( - self, w, n_bit=4, group_size=128, precision=torch.float32 - ): - scales, zeros = self.get_group_qparams_symmetric( - w, n_bit, group_size, precision - ) - n_bit = 4 - max_int = 2 ** (n_bit - 1) - 1 - min_int = -(2 ** (n_bit - 1)) - # TODO: currently we don't know how to express torch.int4, we'll - # add torch.int4 to core later - w_int8 = torch.ops.quantized_decomposed.quantize_per_channel_group( - w, scales, zeros, min_int, max_int, torch.int8, group_size - ) - - return w_int8, scales, zeros - - def fwd_input_per_token(self, input: torch.Tensor) -> torch.Tensor: - ip_quant_min = -128 - ip_quant_max = 127 - ( - ip_scales, - ip_zero_points, - ) = torch.ops.quantized_decomposed.choose_qparams_per_token_asymmetric( - input, torch.int8 - ) - - input = torch.ops.quantized_decomposed.quantize_per_token( - input, - ip_scales, - ip_zero_points, - ip_quant_min, - ip_quant_max, - torch.int8, - ) - input = torch.ops.quantized_decomposed.dequantize_per_token( - input, - ip_scales, - ip_zero_points, - ip_quant_min, - ip_quant_max, - torch.int8, - self.op_dtype, - ) - return input - - def quant_weight_per_channel(self): - ( - self.w_scales.data, - self.w_zero_points.data, - ) = self.get_channel_qparams_symmetric( - self.w, n_bit=self.w_n_bit, precision=self.op_dtype - ) - self.w_q.data = torch.ops.quantized_decomposed.quantize_per_channel( - self.w, - self.w_scales, - self.w_zero_points, - axis=0, - quant_min=self.w_quant_min, - quant_max=self.w_quant_max, - dtype=torch.int8, - ) - - def quant_weight_per_channel_group(self): - self.w_q.data, w, zp = self.group_quantize_tensor_symmetric( - self.w, - n_bit=self.w_n_bit, - group_size=self.group_size, - ) - expected_min, expected_max = self.get_min_max(self.w_n_bit) - assert ( - torch.min(self.w_q.data) >= expected_min - ), "Found smaller than min element in quantized weight tensor" - assert ( - torch.max(self.w_q.data) <= expected_max - ), "Found larger than max element in quantized weight tensor" - assert ( - w.ndim == 2 and zp.ndim == 2 - ), f"Expecting 2d scales and zp tensors, but got {w.shape}, {zp.shape}" - self.w_scales.data, self.w_zero_points.data = w, zp - - def fwd_weight_per_channel(self) -> torch.Tensor: - # This is HACKY because the dequant will produce fp32 - return torch.ops.quantized_decomposed.dequantize_per_channel( - self.w_q, - self.w_scales, - self.w_zero_points, - axis=0, - quant_min=self.w_quant_min, - quant_max=self.w_quant_max, - 
dtype=torch.int8, # Regardless of w_n_bit, convert to 4b later - ) - - def fwd_weight_per_channel_group(self) -> torch.Tensor: - return torch.ops.quantized_decomposed.dequantize_per_channel_group( - self.w_q, - self.w_scales, - self.w_zero_points, - self.w_quant_min, - self.w_quant_max, - dtype=torch.int8, # Regardless of w_n_bit, convert to 4b later - group_size=self.group_size, - output_dtype=self.op_dtype, - ) - - def forward(self, input: torch.Tensor) -> torch.Tensor: - # Input - input = self.fwd_input_per_token(input) - - # Weights - w = ( - self.fwd_weight_per_channel_group() - if self.w_scales.ndim == 2 - else self.fwd_weight_per_channel() - ) - assert isinstance(w, torch.Tensor) - return torch.nn.functional.linear(input, w, self.bias) - - def _test_manual_dq_linear( + def _test_groupwise_dq_linear( self, mod: torch.nn.Module, inputs: Tuple[torch.Tensor], - weight_groupwise: bool = False, use_bias: bool = False, - atol: float = 1e-3, - rtol: float = 1e-3, + group_size: int = 8, + num_linears: int = 1, + atol: float = 5e-3, + rtol: float = 5e-3, ): - linear_edge_op = ( - "executorch_exir_dialects_edge__ops_aten_addmm_default" - if use_bias - else "executorch_exir_dialects_edge__ops_aten_mm_default" + quantize_(mod, int8_dynamic_activation_int4_weight(group_size=group_size)) + unwrap_tensor_subclass(mod) + DynamicallyQuantizedPartitioner = XnnpackPartitioner( + config_precisions=ConfigPrecisionType.DYNAMIC_QUANT, + per_op_mode=True, ) - - weight_dq_edge_op = ( - "executorch_exir_dialects_edge__ops_quantized_decomposed_dequantize_per_channel_group_default" - if weight_groupwise - else "torch.ops.quantized_decomposed.dequantize_per_channel.default" - ) - - weight_dq_aten_op = ( - "torch.ops.quantized_decomposed.dequantize_per_channel_group.default" - if weight_groupwise - else "torch.ops.quantized_decomposed.dequantize_per_channel.default" + tester = ( + Tester(mod, inputs) + .export() + .check_count( + { + "torch.ops.quant.choose_qparams_affine.default": 1 * num_linears, + "torch.ops.quant.quantize_affine.default": 1 * num_linears, + "torch.ops.quant.dequantize_affine.default": 2 * num_linears, + "torch.ops.aten.linear.default": 1 * num_linears, + } + ) ) - for legacy_partitioner in (True, False): - tester = ( - Tester(mod, inputs) - .export() - .check_count( - { - "torch.ops.quantized_decomposed.choose_qparams_per_token_asymmetric.default": 1, - "torch.ops.quantized_decomposed.quantize_per_token.default": 1, - "torch.ops.quantized_decomposed.dequantize_per_token.default": 1, - weight_dq_aten_op: 1, - "torch.ops.aten.linear.default": 1, - } - ) + ( + tester.to_edge_transform_and_lower( + ToEdgeTransformAndLower([DynamicallyQuantizedPartitioner]) ) + ) - DynamicallyQuantizedPartitioner = XnnpackPartitioner( - config_precisions=ConfigPrecisionType.DYNAMIC_QUANT, - per_op_mode=True, + ( + tester.check_count( + { + "torch.ops.higher_order.executorch_call_delegate": 1, + } ) - if legacy_partitioner: - tester.to_edge() - tester.partition(Partition(DynamicallyQuantizedPartitioner)) - else: - ( - tester.to_edge_transform_and_lower( - ToEdgeTransformAndLower([DynamicallyQuantizedPartitioner]) - ) - ) - - ( - tester.check_count( - { - "torch.ops.higher_order.executorch_call_delegate": 1, - } - ) - .check_not( - [ - "executorch_exir_dialects_edge__ops_quantized_decomposed_choose_qparams_per_token_asymmetric_default", - "executorch_exir_dialects_edge__ops_quantized_decomposed_quantize_per_token_default", - "executorch_exir_dialects_edge__ops_quantized_decomposed_dequantize_per_token_default", - 
weight_dq_edge_op, - linear_edge_op, - ] - ) - .to_executorch() - .serialize() - .run_method_and_compare_outputs(atol=atol, rtol=rtol) + .check_not( + [ + "executorch_exir_dialects_edge__ops_quant_choose_qparams_affine_default", + "executorch_exir_dialects_edge__ops_quant_quantize_affine_default", + "executorch_exir_dialects_edge__ops_quant_dequantize_affine_default", + "executorch_exir_dialects_edge__ops_aten_mm_default", + "executorch_exir_dialects_edge__ops_aten_addmm_default", + ] ) - - def _run_manual_dqlinear_tests(self, weight_n_bit: int, op_dtype: torch.dtype): - in_sizes = [1, 4, 4] - input_sizes = [4, 37, 17] - output_sizes = [4, 17, 37] - - for use_bias in [True, False]: - for i, _ in enumerate(in_sizes): - in_size = int(in_sizes[i]) - input_size = int(input_sizes[i]) - output_size = int(output_sizes[i]) - mod = self.ManualDQLinear( - input_channels=input_size, - output_channels=output_size, - weight_n_bit=weight_n_bit, - dtype=op_dtype, - use_bias=use_bias, - ) - - inputs = (torch.randn(1, in_size, input_size).to(op_dtype),) - self._test_manual_dq_linear(mod, inputs, use_bias=use_bias) + .to_executorch() + .serialize() + .run_method_and_compare_outputs(atol=atol, rtol=rtol) + ) diff --git a/backends/xnnpack/test/ops/lstm.py b/backends/xnnpack/test/ops/lstm.py new file mode 100644 index 00000000000..bfc6113c417 --- /dev/null +++ b/backends/xnnpack/test/ops/lstm.py @@ -0,0 +1,63 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import unittest + +import torch +from executorch.backends.xnnpack.partition.xnnpack_partitioner import XnnpackPartitioner + +from executorch.backends.xnnpack.test.tester import Tester +from executorch.backends.xnnpack.test.tester.tester import ToEdgeTransformAndLower + + +class TestLSTM(unittest.TestCase): + class LSTMLinear(torch.nn.Module): + def __init__(self, input_size, hidden_size, out_size): + super().__init__() + self.lstm = torch.nn.LSTM( + input_size=input_size, hidden_size=hidden_size, batch_first=True + ) + self.linear = torch.nn.Linear(hidden_size, hidden_size) + self.linear2 = torch.nn.Linear(hidden_size, out_size) + + def forward(self, x): + x, hs = self.lstm(x) + x = self.linear(x[:, -1, :]) + x = self.linear2(x) + return torch.nn.functional.log_softmax(x, dim=1) + + def test_fp32_lstm(self): + ( + Tester(self.LSTMLinear(32, 32, 10), (torch.rand(1, 32, 32),)) + .export() + .to_edge_transform_and_lower() + .check_not(["executorch_exir_dialects_edge__ops_aten_addmm_default"]) + .check_not( + ["p_lstm_weight", "p_lstm_bias"] + ) # These Should be Consumed by Delegate + .to_executorch() + .serialize() + .run_method_and_compare_outputs() + ) + + def test_fp32_lstm_force_dynamic_linear(self): + ( + Tester(self.LSTMLinear(32, 32, 10), (torch.rand(1, 32, 32),)) + .export() + .to_edge_transform_and_lower( + ToEdgeTransformAndLower( + partitioners=[XnnpackPartitioner(force_fp32_dynamic_linear=True)] + ) + ) + .check_not(["executorch_exir_dialects_edge__ops_aten_addmm_default"]) + # Weights are supplied as input to linears + .check(["p_lstm_weight_hh_l0", "p_lstm_weight_ih_l0"]) + # Biases are owned by delegates + .check_not(["p_lstm_bias"]) + .to_executorch() + .serialize() + .run_method_and_compare_outputs() + ) diff --git a/backends/xnnpack/test/ops/mean_dim.py b/backends/xnnpack/test/ops/mean_dim.py index e39d3aee080..3bac5f3239c 100644 --- a/backends/xnnpack/test/ops/mean_dim.py 
+++ b/backends/xnnpack/test/ops/mean_dim.py @@ -56,6 +56,19 @@ def test_fp32_mean_dim_unsupported(self): .check_count({"executorch_exir_dialects_edge__ops_aten_mean_dim": 1}) ) + def test_fp32_mean_dim_unsupported_3d(self): + """ + XNNPack mean.dim implementation only supports 4D tensors. + """ + inputs = (torch.randn(1, 5, 4),) + ( + Tester(self.MeanDim((-1, -2)), inputs) + .export() + .check_count({"torch.ops.aten.mean.dim": 1}) + .to_edge_transform_and_lower() + .check_count({"executorch_exir_dialects_edge__ops_aten_mean_dim": 1}) + ) + def test_qs8_mean_dim(self): inputs = (torch.randn(1, 5, 4, 4),) ( diff --git a/backends/xnnpack/test/test_xnnpack_utils.py b/backends/xnnpack/test/test_xnnpack_utils.py index c6b1513d317..ea9217e04ab 100644 --- a/backends/xnnpack/test/test_xnnpack_utils.py +++ b/backends/xnnpack/test/test_xnnpack_utils.py @@ -25,6 +25,12 @@ # import the xnnpack backend implementation from executorch.backends.xnnpack.xnnpack_preprocess import XnnpackBackend +from executorch.devtools import BundledProgram + +from executorch.devtools.bundled_program.config import MethodTestCase, MethodTestSuite +from executorch.devtools.bundled_program.serialize import ( + serialize_from_bundled_program_to_flatbuffer, +) from executorch.exir import ExecutorchProgram, ExirExportedProgram from executorch.exir.backend.backend_api import to_backend, validation_disabled @@ -34,12 +40,6 @@ _load_for_executorch_from_buffer, ) from executorch.extension.pytree import tree_flatten -from executorch.sdk import BundledProgram - -from executorch.sdk.bundled_program.config import MethodTestCase, MethodTestSuite -from executorch.sdk.bundled_program.serialize import ( - serialize_from_bundled_program_to_flatbuffer, -) from torch.ao.quantization import ( # @manual default_per_channel_symmetric_qnnpack_qconfig, @@ -72,6 +72,7 @@ get_symmetric_quantization_config, XNNPACKQuantizer, ) +from torch.export import export_for_training from torch.testing import FileCheck @@ -315,10 +316,11 @@ def quantize_and_test_model_with_quantizer( ): module.eval() # program capture - m = torch._export.capture_pre_autograd_graph( + + m = export_for_training( module, example_inputs, - ) + ).module() quantizer = XNNPACKQuantizer() quantization_config = get_symmetric_quantization_config() diff --git a/backends/xnnpack/test/tester/tester.py b/backends/xnnpack/test/tester/tester.py index 6fdf1615215..7586c4f2313 100644 --- a/backends/xnnpack/test/tester/tester.py +++ b/backends/xnnpack/test/tester/tester.py @@ -14,7 +14,6 @@ from typing import Any, Callable, Dict, List, Optional, Tuple, Type, Union import torch -import torch.export._trace as export_trace from executorch.backends.xnnpack.partition.xnnpack_partitioner import XnnpackPartitioner from executorch.backends.xnnpack.passes import XNNPACKPassManager from executorch.backends.xnnpack.utils.configs import get_xnnpack_edge_compile_config @@ -31,6 +30,7 @@ from executorch.exir.passes.sym_shape_eval_pass import ConstraintBasedSymShapeEvalPass from executorch.exir.print_program import pretty_print, print_program +from torch.export import export_for_training logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) @@ -157,10 +157,10 @@ def __init__( def run( self, artifact: torch.nn.Module, inputs: Optional[Tuple[torch.Tensor]] ) -> None: - captured_graph = export_trace._export( - artifact, inputs, pre_dispatch=True - ).module() + assert inputs is not None + captured_graph = export_for_training(artifact, inputs).module() + assert isinstance(captured_graph, 
torch.fx.GraphModule) prepared = prepare_pt2e(captured_graph, self.quantizer) if self.calibrate: @@ -561,7 +561,8 @@ def to_edge(self, to_edge_stage: Optional[ToEdge] = None): if not to_edge_stage: to_edge_stage = ToEdge() to_edge_stage.edge_compile_conf._skip_dim_order = True - return self._run_stage(to_edge_stage) + res = self._run_stage(to_edge_stage) + return res def to_edge_transform_and_lower( self, to_edge_and_transform_stage: Optional[ToEdgeTransformAndLower] = None diff --git a/backends/xnnpack/third-party/XNNPACK b/backends/xnnpack/third-party/XNNPACK index 1d139a3b4b7..87ee0b46b83 160000 --- a/backends/xnnpack/third-party/XNNPACK +++ b/backends/xnnpack/third-party/XNNPACK @@ -1 +1 @@ -Subproject commit 1d139a3b4b7155889c88c31f370a82c48e7ca89c +Subproject commit 87ee0b46b834f67bad9025d4a82ed5654f3403d3 diff --git a/backends/xnnpack/third-party/cpuinfo b/backends/xnnpack/third-party/cpuinfo index d6860c477c9..16bfc1622c6 160000 --- a/backends/xnnpack/third-party/cpuinfo +++ b/backends/xnnpack/third-party/cpuinfo @@ -1 +1 @@ -Subproject commit d6860c477c99f1fce9e28eb206891af3c0e1a1d7 +Subproject commit 16bfc1622c6902d6f91d316ec54894910c620325 diff --git a/backends/xnnpack/third-party/generate-xnnpack-wrappers.py b/backends/xnnpack/third-party/generate-xnnpack-wrappers.py index bda79527178..e9b23e4a784 100644 --- a/backends/xnnpack/third-party/generate-xnnpack-wrappers.py +++ b/backends/xnnpack/third-party/generate-xnnpack-wrappers.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 from __future__ import print_function +from pathlib import Path import collections import os import sys @@ -36,8 +37,8 @@ "PROD_AVX512F_MICROKERNEL_SRCS": "defined(__i386__) || defined(__i686__) || defined(__x86_64__)", "PROD_AVX512SKX_MICROKERNEL_SRCS": "defined(__i386__) || defined(__i686__) || defined(__x86_64__)", "PROD_AVX512VBMI_MICROKERNEL_SRCS": "defined(__i386__) || defined(__i686__) || defined(__x86_64__)", - "PROD_AVX512VNNIGFNI_MICROKERNEL_SRCS": "defined(__i386__) || defined(__i686__) || defined(__x86_64__)", "PROD_AVX512VNNI_MICROKERNEL_SRCS": "defined(__i386__) || defined(__i686__) || defined(__x86_64__)", + "PROD_AVX512VNNIGFNI_MICROKERNEL_SRCS": "defined(__i386__) || defined(__i686__) || defined(__x86_64__)", "PROD_RVV_MICROKERNEL_SRCS": "defined(__riscv) || defined(__riscv__)", "PROD_AVXVNNI_MICROKERNEL_SRCS": "defined(__i386__) || defined(__i686__) || defined(__x86_64__)", "AARCH32_ASM_MICROKERNEL_SRCS": "defined(__arm__)", @@ -46,7 +47,7 @@ # add non-prod microkernel sources here: } -SRC_NAMES = set([ +SRC_NAMES = { "OPERATOR_SRCS", "SUBGRAPH_SRCS", "LOGGING_SRCS", @@ -81,30 +82,42 @@ "PROD_AVX512F_MICROKERNEL_SRCS", "PROD_AVX512SKX_MICROKERNEL_SRCS", "PROD_AVX512VBMI_MICROKERNEL_SRCS", - "PROD_AVX512VNNIGFNI_MICROKERNEL_SRCS", "PROD_AVX512VNNI_MICROKERNEL_SRCS", + "PROD_AVX512VNNIGFNI_MICROKERNEL_SRCS", "PROD_RVV_MICROKERNEL_SRCS", "PROD_AVXVNNI_MICROKERNEL_SRCS", "AARCH32_ASM_MICROKERNEL_SRCS", "AARCH64_ASM_MICROKERNEL_SRCS", # add non-prod microkernel sources here: -]) +} def handle_singleline_parse(line): start_index = line.find("(") end_index = line.find(")") line = line[start_index+1:end_index] key_val = line.split(" ") - return key_val[0], list(map(lambda x: x[4:], key_val[1:])) + return key_val[0], [x[4:] for x in key_val[1:]] def update_sources(xnnpack_path, cmakefile = "XNNPACK/CMakeLists.txt"): + print(f"Updating sources from {cmakefile}") sources = collections.defaultdict(list) with open(os.path.join(xnnpack_path, cmakefile)) as cmake: lines = cmake.readlines() i = 0 while i < 
len(lines): line = lines[i] + + if lines[i].startswith("INCLUDE"): + file, _ = handle_singleline_parse(line) + if file.startswith("cmake/gen/"): + path = Path(xnnpack_path) / "XNNPACK" / file + local_sources = update_sources(xnnpack_path, path.absolute().as_posix()) + for k,v in local_sources.items(): + if k in sources: + sources[k] = sources[k] + local_sources[k] + else: + sources[k] = local_sources[k] if lines[i].startswith("SET") and "src/" in lines[i]: name, val = handle_singleline_parse(line) @@ -132,7 +145,7 @@ def gen_wrappers(xnnpack_path): xnnpack_sources = collections.defaultdict(list) sources = update_sources(xnnpack_path) - microkernels_sources = update_sources(xnnpack_path, "XNNPACK/cmake/microkernels.cmake") + microkernels_sources = update_sources(xnnpack_path, "XNNPACK/cmake/gen/microkernels.cmake") for key in microkernels_sources: sources[key] = microkernels_sources[key] @@ -186,6 +199,8 @@ def gen_wrappers(xnnpack_path): def main(argv): + print("Generating wrappers...") + if argv is None or len(argv) == 0: gen_wrappers(".") else: diff --git a/backends/xnnpack/third-party/xnnpack.buck.bzl b/backends/xnnpack/third-party/xnnpack.buck.bzl index a1add446643..7f0a8ca6f21 100644 --- a/backends/xnnpack/third-party/xnnpack.buck.bzl +++ b/backends/xnnpack/third-party/xnnpack.buck.bzl @@ -1,7 +1,6 @@ load("//third-party:glob_defs.bzl", "subdir_glob") load( ":xnnpack_src_defs.bzl", - "JIT_SRCS", "LOGGING_SRCS", "OPERATOR_SRCS", "SUBGRAPH_SRCS", @@ -69,27 +68,6 @@ def define_xnnpack(): ], ) - # @lint-ignore BUCKLINT: native and fb_native are explicitly forbidden in fbcode. - native.cxx_library( - name = "jit_memory", - srcs = JIT_SRCS, - headers = subdir_glob([ - ("XNNPACK/src", "**/*.h"), - ]), - header_namespace = "", - compiler_flags = [ - "-std=c++17", - ], - preferred_linkage = "static", - preprocessor_flags = [ - "-DXNN_LOG_LEVEL=0", - ], - exported_deps = [ - ":clog", - ":interface", - ], - ) - # @lint-ignore BUCKLINT: native and fb_native are explicitly forbidden in fbcode. 
native.cxx_library( name = "operators", @@ -139,7 +117,6 @@ def define_xnnpack(): preferred_linkage = "static", preprocessor_flags = [ "-DXNN_LOG_LEVEL=0", - "-DXNN_ENABLE_JIT=0", "-DXNN_ENABLE_SPARSE=0", "-DXNN_ENABLE_GEMM_M_SPECIALIZATION=0", "-DXNN_ENABLE_MEMOPT", @@ -1223,7 +1200,6 @@ def define_xnnpack(): ] ARM_XNNPACK_DEPS = [ - ":jit_memory", ":ukernels_armsimd32", ":ukernels_fp16arith", ":ukernels_asm", @@ -1246,11 +1222,10 @@ def define_xnnpack(): "XNNPACK/src/configs/hardware-config.c", "XNNPACK/src/microparams-init.c", "XNNPACK/src/operator-run.c", - "XNNPACK/src/operators/post-operation.c", "XNNPACK/src/microkernel-utils.c", ], headers = subdir_glob([ - ("XNNPACK/src", "xnnpack/*.h"), + ("XNNPACK/src", "**/*.h"), ("XNNPACK/include", "**/*.h"), ]), exported_headers = { @@ -1271,7 +1246,6 @@ def define_xnnpack(): "-DXNN_NO_X8_OPERATORS", "-DXNN_ENABLE_MEMOPT", "-DXNN_ENABLE_SPARSE=0", - "-DXNN_ENABLE_JIT=0", "-DXNN_ENABLE_ASSEMBLY", "-DXNN_ENABLE_GEMM_M_SPECIALIZATION", "-DXNN_ENABLE_ARM_DOTPROD", diff --git a/backends/xnnpack/third-party/xnnpack_src_defs.bzl b/backends/xnnpack/third-party/xnnpack_src_defs.bzl index 0a0beba7efd..d8ebe7c72bb 100644 --- a/backends/xnnpack/third-party/xnnpack_src_defs.bzl +++ b/backends/xnnpack/third-party/xnnpack_src_defs.bzl @@ -200,7 +200,6 @@ PROD_F16C_MICROKERNEL_SRCS = [ ] PROD_XOP_MICROKERNEL_SRCS = [ - "XNNPACK/src/amalgam/gen/xop.c", ] PROD_AVX512F_MICROKERNEL_SRCS = [ @@ -493,30 +492,18 @@ AARCH64_ASM_MICROKERNEL_SRCS = [ "XNNPACK/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x16c4-minmax-fp32-asm-aarch64-neondot-cortex-a55.S", "XNNPACK/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x16c4-minmax-fp32-asm-aarch64-neondot-ld64.S", "XNNPACK/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x16c4-minmax-fp32-asm-aarch64-neondot-ld128.S", - "XNNPACK/src/qu8-gemm/gen/qu8-gemm-4x8c4-minmax-rndnu-asm-aarch64-neondot-cortex-a55.S", - "XNNPACK/src/qu8-gemm/gen/qu8-gemm-4x8c4-minmax-rndnu-asm-aarch64-neondot-ld128.S", "XNNPACK/src/qu8-gemm/gen/qu8-gemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-cortex-a53-prfm.S", "XNNPACK/src/qu8-gemm/gen/qu8-gemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-cortex-a53.S", "XNNPACK/src/qu8-gemm/gen/qu8-gemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-cortex-a75-prfm.S", "XNNPACK/src/qu8-gemm/gen/qu8-gemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-cortex-a75.S", "XNNPACK/src/qu8-gemm/gen/qu8-gemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-ld64-prfm.S", "XNNPACK/src/qu8-gemm/gen/qu8-gemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-ld64.S", - "XNNPACK/src/qu8-gemm/gen/qu8-gemm-4x16c4-minmax-fp32-asm-aarch64-neondot-cortex-a55.S", - "XNNPACK/src/qu8-gemm/gen/qu8-gemm-4x16c4-minmax-fp32-asm-aarch64-neondot-ld128.S", - "XNNPACK/src/qu8-gemm/gen/qu8-gemm-4x16c4-minmax-rndnu-asm-aarch64-neondot-cortex-a55.S", - "XNNPACK/src/qu8-gemm/gen/qu8-gemm-4x16c4-minmax-rndnu-asm-aarch64-neondot-ld128.S", - "XNNPACK/src/qu8-igemm/gen/qu8-igemm-4x8c4-minmax-rndnu-asm-aarch64-neondot-cortex-a55.S", - "XNNPACK/src/qu8-igemm/gen/qu8-igemm-4x8c4-minmax-rndnu-asm-aarch64-neondot-ld128.S", "XNNPACK/src/qu8-igemm/gen/qu8-igemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-cortex-a53-prfm.S", "XNNPACK/src/qu8-igemm/gen/qu8-igemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-cortex-a53.S", "XNNPACK/src/qu8-igemm/gen/qu8-igemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-cortex-a75-prfm.S", "XNNPACK/src/qu8-igemm/gen/qu8-igemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-cortex-a75.S", 
"XNNPACK/src/qu8-igemm/gen/qu8-igemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-ld64-prfm.S", "XNNPACK/src/qu8-igemm/gen/qu8-igemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-ld64.S", - "XNNPACK/src/qu8-igemm/gen/qu8-igemm-4x16c4-minmax-fp32-asm-aarch64-neondot-cortex-a55.S", - "XNNPACK/src/qu8-igemm/gen/qu8-igemm-4x16c4-minmax-fp32-asm-aarch64-neondot-ld128.S", - "XNNPACK/src/qu8-igemm/gen/qu8-igemm-4x16c4-minmax-rndnu-asm-aarch64-neondot-cortex-a55.S", - "XNNPACK/src/qu8-igemm/gen/qu8-igemm-4x16c4-minmax-rndnu-asm-aarch64-neondot-ld128.S", ] XNNPACK_SRCS = [ diff --git a/backends/xnnpack/third-party/xnnpack_wrapper_defs.bzl b/backends/xnnpack/third-party/xnnpack_wrapper_defs.bzl index 2dbb41ff01b..a9d4af95ccf 100644 --- a/backends/xnnpack/third-party/xnnpack_wrapper_defs.bzl +++ b/backends/xnnpack/third-party/xnnpack_wrapper_defs.bzl @@ -92,7 +92,6 @@ PROD_F16C_MICROKERNEL_SRCS = [ ] PROD_XOP_MICROKERNEL_SRCS = [ - "xnnpack_wrappers/amalgam/gen/xop.c", ] PROD_FMA3_MICROKERNEL_SRCS = [ @@ -447,28 +446,16 @@ AARCH64_ASM_MICROKERNEL_SRCS = [ "xnnpack_wrappers/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x16c4-minmax-fp32-asm-aarch64-neondot-cortex-a55.S", "xnnpack_wrappers/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x16c4-minmax-fp32-asm-aarch64-neondot-ld64.S", "xnnpack_wrappers/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x16c4-minmax-fp32-asm-aarch64-neondot-ld128.S", - "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x8c4-minmax-rndnu-asm-aarch64-neondot-cortex-a55.S", - "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x8c4-minmax-rndnu-asm-aarch64-neondot-ld128.S", "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-cortex-a53-prfm.S", "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-cortex-a53.S", "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-cortex-a75-prfm.S", "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-cortex-a75.S", "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-ld64-prfm.S", "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-ld64.S", - "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x16c4-minmax-fp32-asm-aarch64-neondot-cortex-a55.S", - "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x16c4-minmax-fp32-asm-aarch64-neondot-ld128.S", - "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x16c4-minmax-rndnu-asm-aarch64-neondot-cortex-a55.S", - "xnnpack_wrappers/qu8-gemm/gen/qu8-gemm-4x16c4-minmax-rndnu-asm-aarch64-neondot-ld128.S", - "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x8c4-minmax-rndnu-asm-aarch64-neondot-cortex-a55.S", - "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x8c4-minmax-rndnu-asm-aarch64-neondot-ld128.S", "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-cortex-a53-prfm.S", "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-cortex-a53.S", "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-cortex-a75-prfm.S", "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-cortex-a75.S", "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-ld64-prfm.S", "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x16-minmax-rndnu-asm-aarch64-neon-mlal-lane-ld64.S", - "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x16c4-minmax-fp32-asm-aarch64-neondot-cortex-a55.S", - "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x16c4-minmax-fp32-asm-aarch64-neondot-ld128.S", - 
"xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x16c4-minmax-rndnu-asm-aarch64-neondot-cortex-a55.S", - "xnnpack_wrappers/qu8-igemm/gen/qu8-igemm-4x16c4-minmax-rndnu-asm-aarch64-neondot-ld128.S", ] diff --git a/backends/xnnpack/utils/quant_utils.py b/backends/xnnpack/utils/quant_utils.py index d5a7ec7fd0d..7c035757a6f 100644 --- a/backends/xnnpack/utils/quant_utils.py +++ b/backends/xnnpack/utils/quant_utils.py @@ -4,6 +4,10 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +import operator +from itertools import accumulate +from typing import cast + import torch from executorch.exir.backend.canonical_partitioners.config_partitioner import ( format_target_name, @@ -15,6 +19,7 @@ "quantize_per_channel.default", "quantize_per_channel_group.default", "quantize_per_token.default", + "quantize_affine.default", } _DQ_OPS = { @@ -23,12 +28,14 @@ "dequantize_per_channel.default", "dequantize_per_channel_group.default", "dequantize_per_token.default", + "dequantize_affine.default", } _QPARAM_OPS = { "choose_qparams.tensor", "choose_qparams_per_token_asymmetric.default", + "choose_qparams_affine.default", } _DYNAMIC_OPS = { @@ -43,8 +50,9 @@ def is_dynamic_qdq(node: torch.fx.Node) -> bool: if node.op != "call_function": return False node_name = format_target_name(node.target.__name__) # pyre-ignore + is_dynamic_affine = is_per_token(node) and not is_per_channel_group(node) - return node_name in _DYNAMIC_OPS + return node_name in _DYNAMIC_OPS or is_dynamic_affine def is_qparam(node: torch.fx.Node) -> bool: @@ -75,4 +83,106 @@ def is_per_channel(node: torch.fx.Node) -> bool: if not (is_quant(node) or is_dequant(node)): return False - return "per_channel" in node.target.__name__ # pyre-ignore + is_affine_per_channel_group = is_per_channel_group(node) + is_per_channel = "per_channel" in node.target.__name__ # pyre-ignore + + return is_per_channel or is_affine_per_channel_group + + +def is_affine_qdq(node: torch.fx.Node) -> bool: + if not (is_quant(node) or is_dequant(node)): + return False + + return "quantize_affine" in node.target.__name__ # pyre-ignore + + +def _get_block_size_input_scale(node: torch.fx.Node): + assert is_affine_qdq(node) + block_size = node.args[1] + input_val = node.all_input_nodes[0].meta["val"] + scale_val = node.all_input_nodes[1].meta["val"] + return block_size, input_val, scale_val + + +def is_per_token(node: torch.fx.Node): + if not (is_quant(node) or is_dequant(node)): + return False + + if "per_token" in node.target.__name__: # pyre-ignore + return True + elif is_affine_qdq(node): + block_size, input_val, scale_val = _get_block_size_input_scale(node) + flag = True + scale_numel_expected = 1 + for i in range(len(block_size) - 1): + flag &= block_size[i] == 1 + scale_numel_expected *= input_val.shape[i] + + flag &= block_size[-1] == input_val.shape[-1] + flag &= scale_val.numel() == scale_numel_expected + return flag + + return False + + +def is_per_channel_group(node: torch.fx.Node): + if not (is_quant(node) or is_dequant(node)): + return False + + if "per_channel_group" in node.target.__name__: # pyre-ignore + return True + elif is_affine_qdq(node): + block_size, input_val, scale_val = _get_block_size_input_scale(node) + flag = True + flag &= len(block_size) == 2 + flag &= block_size[0] == 1 + group_size = block_size[1] + scale_numel = list(accumulate(scale_val.shape, operator.mul))[-1] + input_numel = list(accumulate(input_val.shape, operator.mul))[-1] + flag &= input_numel == group_size * 
scale_numel + return flag + + return False + + +def extract_qdq_affine_op_args_for_decomposed_ops(node: torch.fx.Node): + if not is_affine_qdq(node): + return None, None + # make sure input_dtype and zero_point_domain have expected values + input_node = node.args[0] + scale_node = node.args[2] + zero_point_node = node.args[3] + args = [input_node, scale_node, zero_point_node] + assert ( + len(node.args) > 4 + ), f"expecting at least 6 args, got node: {node.format_node()}" + + if node.args[4] != torch.int8: + return None, None + target_dtype = cast(torch.dtype, node.args[4]) + + if len(node.args) > 6: + # quant_min + args.append(node.args[5]) + # quant_max + args.append(node.args[6]) + else: + dtype_info = torch.iinfo(target_dtype) + quant_min = dtype_info.min + quant_max = dtype_info.max + args.append(quant_min) + args.append(quant_max) + + # add target_dtype_node after quant_min/quant_max + args.append(target_dtype) + # zero_point_domain + if len(node.args) > 7 and node.args[7] != "INT": + return None, None + + if is_per_channel_group(node): + block_sizes = cast(list[int], node.args[1]) + args.append(block_sizes[-1]) + + args.append(node.args[-1]) + + return args diff --git a/build/Codegen.cmake b/build/Codegen.cmake index 1c309cf3bce..381cd0958fd 100644 --- a/build/Codegen.cmake +++ b/build/Codegen.cmake @@ -78,7 +78,8 @@ function(generate_bindings_for_kernels) # Executorch runtime. execute_process( COMMAND - "${PYTHON_EXECUTABLE}" -c "from distutils.sysconfig import get_python_lib;print(get_python_lib())" + "${PYTHON_EXECUTABLE}" -c + "from distutils.sysconfig import get_python_lib;print(get_python_lib())" OUTPUT_VARIABLE site-packages-out ERROR_VARIABLE site-packages-out-error RESULT_VARIABLE site-packages-result @@ -150,9 +151,8 @@ function(gen_custom_ops_aot_lib) include(${EXECUTORCH_ROOT}/build/Utils.cmake) target_link_options_shared_lib(${GEN_LIB_NAME}) - if(EXECUTORCH_BUILD_PYBIND AND APPLE) - target_link_libraries(${GEN_LIB_NAME} PRIVATE executorch_no_prim_ops) - target_link_options(${GEN_LIB_NAME} PRIVATE -undefined dynamic_lookup) + if(TARGET portable_lib) + target_link_libraries(${GEN_LIB_NAME} PRIVATE portable_lib) else() target_link_libraries(${GEN_LIB_NAME} PRIVATE executorch_no_prim_ops) endif() diff --git a/build/Test.cmake b/build/Test.cmake index b2b23cb03ad..d6ef124793c 100644 --- a/build/Test.cmake +++ b/build/Test.cmake @@ -5,8 +5,8 @@ # LICENSE file in the root directory of this source tree. # -# This file is intended to have helper functions for test-related -# CMakeLists.txt files. +# This file is intended to have helper functions for test-related CMakeLists.txt +# files. 
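The quant_utils.py hunk above teaches the XNNPACK partitioner to recognize torchao's `quantize_affine`/`dequantize_affine` ops as per-token or per-channel-group quantization by comparing the op's `block_size` argument against the input and scale shapes. A minimal sketch of that block-size reasoning on plain shapes follows (the helper names below are illustrative only, not part of the ExecuTorch API):

```python
# Re-statement of the block_size checks added in
# backends/xnnpack/utils/quant_utils.py, using plain tuples instead of FX nodes.
import math


def looks_per_token(block_size, input_shape, scale_numel) -> bool:
    # Per-token: every leading block dim is 1 and the last block dim spans the
    # whole innermost input dim, giving one scale per "token".
    if block_size[-1] != input_shape[-1]:
        return False
    expected_scales = 1
    for b, dim in zip(block_size[:-1], input_shape[:-1]):
        if b != 1:
            return False
        expected_scales *= dim
    return scale_numel == expected_scales


def looks_per_channel_group(block_size, input_shape, scale_numel) -> bool:
    # Per-channel-group: 2-D block_size of (1, group_size) with
    # input_numel == group_size * scale_numel.
    if len(block_size) != 2 or block_size[0] != 1:
        return False
    group_size = block_size[1]
    return math.prod(input_shape) == group_size * scale_numel


# A (1, 32) block over an 8x128 weight with 32 scales is per-channel-group:
assert looks_per_channel_group((1, 32), (8, 128), scale_numel=32)
# A (1, 512) block over a 4x512 activation with 4 scales is per-token:
assert looks_per_token((1, 512), (4, 512), scale_numel=4)
```

The real implementation performs the same comparison on `node.args[1]` (the block size) and the `val` metadata of the input and scale nodes, and `is_dynamic_qdq` additionally treats per-token affine ops that are not per-channel-group as dynamic quantization.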
# # ### Editing this file ### # @@ -25,61 +25,66 @@ find_package(executorch CONFIG REQUIRED) enable_testing() find_package(GTest CONFIG REQUIRED) +target_link_options_shared_lib(cpuinfo) target_link_options_shared_lib(extension_data_loader) target_link_options_shared_lib(portable_kernels) target_link_options_shared_lib(portable_ops_lib) +target_link_options_shared_lib(pthreadpool) target_link_options_shared_lib(quantized_ops_lib) # Add code coverage flags to supported compilers if(EXECUTORCH_USE_CPP_CODE_COVERAGE) if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU") - string(APPEND CMAKE_C_FLAGS " --coverage -fprofile-abs-path") - string(APPEND CMAKE_CXX_FLAGS " --coverage -fprofile-abs-path") + string(APPEND CMAKE_C_FLAGS " --coverage -fprofile-abs-path") + string(APPEND CMAKE_CXX_FLAGS " --coverage -fprofile-abs-path") elseif("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang") - string(APPEND CMAKE_C_FLAGS " -fprofile-instr-generate -fcoverage-mapping") - string(APPEND CMAKE_CXX_FLAGS " -fprofile-instr-generate -fcoverage-mapping") + string(APPEND CMAKE_C_FLAGS " -fprofile-instr-generate -fcoverage-mapping") + string(APPEND CMAKE_CXX_FLAGS + " -fprofile-instr-generate -fcoverage-mapping" + ) else() - message(ERROR "Code coverage for compiler ${CMAKE_CXX_COMPILER_ID} is unsupported") + message(ERROR + "Code coverage for compiler ${CMAKE_CXX_COMPILER_ID} is unsupported" + ) endif() endif() -# A helper function to generate a gtest cxx executable target -# @param target_name: name for the executable -# @param SOURCES : test sources to be compiled. Sometimes -# util sources are used as well -# @param EXTRA LIBS : additional libraries to be linked against -# the target. gtest, gmock, executorch are linked by default, but Sometimes -# user may need additional libraries like kernels. -# We use CMake package executorch in this helper, so user can easily add -# installed libraries. +# A helper function to generate a gtest cxx executable target @param +# target_name: name for the executable @param SOURCES : test +# sources to be compiled. Sometimes util sources are used as well @param EXTRA +# LIBS : additional libraries to be linked against the target. +# gtest, gmock, executorch are linked by default, but Sometimes user may need +# additional libraries like kernels. We use CMake package executorch in this +# helper, so user can easily add installed libraries. # -# Example: -# et_cxx_test(my_test SOURCES my_test.cpp EXTRA_LIBS portable_kernels) +# Example: et_cxx_test(my_test SOURCES my_test.cpp EXTRA_LIBS portable_kernels) # # This defines a gtest executable my_test, compiling my_test.cpp, and linking # against libportable_kernels.a. # function(et_cxx_test target_name) -set(multi_arg_names SOURCES EXTRA_LIBS) -cmake_parse_arguments(ET_CXX_TEST "" "" "${multi_arg_names}" ${ARGN}) + set(multi_arg_names SOURCES EXTRA_LIBS) + cmake_parse_arguments(ET_CXX_TEST "" "" "${multi_arg_names}" ${ARGN}) -# Let files say "include ". -target_include_directories(executorch INTERFACE ${EXECUTORCH_ROOT}/..) + # Let files say "include ". + target_include_directories(executorch INTERFACE ${EXECUTORCH_ROOT}/..) 
-set(ET_TEST_UTIL_SOURCES ${EXECUTORCH_ROOT}/runtime/core/exec_aten/testing_util/tensor_util.cpp) + set(ET_TEST_UTIL_SOURCES + ${EXECUTORCH_ROOT}/runtime/core/exec_aten/testing_util/tensor_util.cpp + ) -add_executable(${target_name} ${ET_CXX_TEST_SOURCES} ${ET_TEST_UTIL_SOURCES}) -# Includes gtest, gmock, executorch by default -target_link_libraries( - ${target_name} GTest::gtest GTest::gtest_main GTest::gmock executorch - ${ET_CXX_TEST_EXTRA_LIBS} -) + add_executable(${target_name} ${ET_CXX_TEST_SOURCES} ${ET_TEST_UTIL_SOURCES}) + # Includes gtest, gmock, executorch by default + target_link_libraries( + ${target_name} GTest::gtest GTest::gtest_main GTest::gmock executorch + ${ET_CXX_TEST_EXTRA_LIBS} + ) -# add_test adds a test target to be used by ctest. -# We use `ExecuTorchTest` as the ctest target name for the test executable -# Usage: cd cmake-out/path/to/test/; ctest -# Note: currently we directly invoke the test target, without using ctest -add_test(ExecuTorchTest ${target_name}) + # add_test adds a test target to be used by ctest. We use `ExecuTorchTest` as + # the ctest target name for the test executable Usage: cd + # cmake-out/path/to/test/; ctest Note: currently we directly invoke the test + # target, without using ctest + add_test(ExecuTorchTest ${target_name}) endfunction() diff --git a/build/Utils.cmake b/build/Utils.cmake index 56fc1e104b0..3ea616d5900 100644 --- a/build/Utils.cmake +++ b/build/Utils.cmake @@ -65,6 +65,12 @@ function(executorch_print_configuration_summary) message(STATUS " EXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL : " "${EXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL}" ) + message(STATUS " EXECUTORCH_BUILD_EXTENSION_TENSOR : " + "${EXECUTORCH_BUILD_EXTENSION_TENSOR}" + ) + message(STATUS " EXECUTORCH_BUILD_EXTENSION_TRAINING : " + "${EXECUTORCH_BUILD_EXTENSION_TRAINING}" + ) message( STATUS " EXECUTORCH_BUILD_FLATC : ${EXECUTORCH_BUILD_FLATC}" @@ -97,7 +103,7 @@ function(executorch_print_configuration_summary) "${EXECUTORCH_BUILD_KERNELS_QUANTIZED}" ) message( - STATUS " EXECUTORCH_BUILD_SDK : ${EXECUTORCH_BUILD_SDK}" + STATUS " EXECUTORCH_BUILD_DEVTOOLS : ${EXECUTORCH_BUILD_DEVTOOLS}" ) message( STATUS @@ -143,11 +149,21 @@ function(macos_kernel_link_options target_name) ) endfunction() +# Same as kernel_link_options but it's for MSVC linker +function(msvc_kernel_link_options target_name) + target_link_options( + ${target_name} INTERFACE + "SHELL:LINKER:/WHOLEARCHIVE:$" + ) +endfunction() + # Ensure that the load-time constructor functions run. By default, the linker # would remove them since there are no other references to them. function(target_link_options_shared_lib target_name) if(APPLE) macos_kernel_link_options(${target_name}) + elseif(MSVC) + msvc_kernel_link_options(${target_name}) else() kernel_link_options(${target_name}) endif() @@ -171,11 +187,20 @@ function(extract_sources sources_file) set(executorch_root ${CMAKE_CURRENT_SOURCE_DIR}) endif() + if(ANDROID_ABI) + if("${ANDROID_ABI}" STREQUAL "arm64-v8a") + set(target_platforms_arg "--target-platforms=shim//:android-arm64") + elseif("${ANDROID_ABI}" STREQUAL "x86_64") + set(target_platforms_arg "--target-platforms=shim//:android-x86_64") + else() + message(FATAL_ERROR "Unsupported ANDROID_ABI setting ${ANDROID_ABI}. 
Please add it here!") + endif() + endif() execute_process( COMMAND ${PYTHON_EXECUTABLE} ${executorch_root}/build/extract_sources.py --config=${executorch_root}/build/cmake_deps.toml --out=${sources_file} - --buck2=${BUCK2} + --buck2=${BUCK2} ${target_platforms_arg} OUTPUT_VARIABLE gen_srcs_output ERROR_VARIABLE gen_srcs_error RESULT_VARIABLE gen_srcs_exit_code diff --git a/build/build_android_llm_demo.sh b/build/build_android_llm_demo.sh index a11e54f932d..42034c254f4 100644 --- a/build/build_android_llm_demo.sh +++ b/build/build_android_llm_demo.sh @@ -17,15 +17,16 @@ build_jar() { build_android_native_library() { ANDROID_ABI="$1" - TOKENIZER="$2" ANDROID_NDK="${ANDROID_NDK:-/opt/ndk}" CMAKE_OUT="cmake-out-android-${ANDROID_ABI}" - if [[ $TOKENIZER == "tiktoken" ]]; then - EXECUTORCH_USE_TIKTOKEN=ON + QNN_SDK_ROOT="${QNN_SDK_ROOT:-}" + if [ -n "$QNN_SDK_ROOT" ]; then + EXECUTORCH_BUILD_QNN=ON else - EXECUTORCH_USE_TIKTOKEN=OFF + EXECUTORCH_BUILD_QNN=OFF fi + cmake . -DCMAKE_INSTALL_PREFIX="${CMAKE_OUT}" \ -DCMAKE_TOOLCHAIN_FILE="${ANDROID_NDK}/build/cmake/android.toolchain.cmake" \ -DANDROID_ABI="${ANDROID_ABI}" \ @@ -36,9 +37,13 @@ build_android_native_library() { -DEXECUTORCH_XNNPACK_SHARED_WORKSPACE=ON \ -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL=ON \ + -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \ + -DEXECUTORCH_BUILD_QNN="${EXECUTORCH_BUILD_QNN}" \ + -DQNN_SDK_ROOT="${QNN_SDK_ROOT}" \ -DCMAKE_BUILD_TYPE=Release \ -B"${CMAKE_OUT}" @@ -49,21 +54,6 @@ build_android_native_library() { fi cmake --build "${CMAKE_OUT}" -j "${CMAKE_JOBS}" --target install --config Release - cmake examples/models/llama2 \ - -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \ - -DANDROID_ABI="$ANDROID_ABI" \ - -DANDROID_PLATFORM=android-23 \ - -DCMAKE_INSTALL_PREFIX="${CMAKE_OUT}" \ - -DEXECUTORCH_USE_TIKTOKEN="${EXECUTORCH_USE_TIKTOKEN}" \ - -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \ - -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ - -DEXECUTORCH_BUILD_XNNPACK=ON \ - -DCMAKE_BUILD_TYPE=Release \ - -B"${CMAKE_OUT}"/examples/models/llama2 - - cmake --build "${CMAKE_OUT}"/examples/models/llama2 -j "${CMAKE_JOBS}" --config Release - - cmake extension/android \ -DCMAKE_TOOLCHAIN_FILE=${ANDROID_NDK}/build/cmake/android.toolchain.cmake \ -DANDROID_ABI="${ANDROID_ABI}" \ @@ -71,8 +61,8 @@ build_android_native_library() { -DCMAKE_INSTALL_PREFIX="${CMAKE_OUT}" \ -DEXECUTORCH_ENABLE_LOGGING=ON \ -DEXECUTORCH_LOG_LEVEL=Info \ + -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \ -DEXECUTORCH_BUILD_LLAMA_JNI=ON \ - -DEXECUTORCH_USE_TIKTOKEN="${EXECUTORCH_USE_TIKTOKEN}" \ -DCMAKE_BUILD_TYPE=Release \ -B"${CMAKE_OUT}"/extension/android @@ -81,6 +71,19 @@ build_android_native_library() { # Copy artifacts to ABI specific directory mkdir -p "${BUILD_AAR_DIR}/jni/${ANDROID_ABI}" cp "${CMAKE_OUT}"/extension/android/*.so "${BUILD_AAR_DIR}/jni/${ANDROID_ABI}/" + + # Copy QNN related so library + if [ -n "$QNN_SDK_ROOT" ] && [ "$ANDROID_ABI" == "arm64-v8a" ]; then + cp "${CMAKE_OUT}"/lib/libqnn_executorch_backend.so "${BUILD_AAR_DIR}/jni/${ANDROID_ABI}/" + cp "${QNN_SDK_ROOT}"/lib/aarch64-android/libQnnHtp.so "${BUILD_AAR_DIR}/jni/${ANDROID_ABI}/" + cp "${QNN_SDK_ROOT}"/lib/aarch64-android/libQnnSystem.so "${BUILD_AAR_DIR}/jni/${ANDROID_ABI}/" + cp "${QNN_SDK_ROOT}"/lib/aarch64-android/libQnnHtpV69Stub.so 
"${BUILD_AAR_DIR}/jni/${ANDROID_ABI}/" + cp "${QNN_SDK_ROOT}"/lib/aarch64-android/libQnnHtpV73Stub.so "${BUILD_AAR_DIR}/jni/${ANDROID_ABI}/" + cp "${QNN_SDK_ROOT}"/lib/aarch64-android/libQnnHtpV75Stub.so "${BUILD_AAR_DIR}/jni/${ANDROID_ABI}/" + cp "${QNN_SDK_ROOT}"/lib/hexagon-v69/unsigned/libQnnHtpV69Skel.so "${BUILD_AAR_DIR}/jni/${ANDROID_ABI}/" + cp "${QNN_SDK_ROOT}"/lib/hexagon-v73/unsigned/libQnnHtpV73Skel.so "${BUILD_AAR_DIR}/jni/${ANDROID_ABI}/" + cp "${QNN_SDK_ROOT}"/lib/hexagon-v75/unsigned/libQnnHtpV75Skel.so "${BUILD_AAR_DIR}/jni/${ANDROID_ABI}/" + fi } build_aar() { @@ -93,23 +96,28 @@ build_aar() { # between Java and JNI find jni -type f -name "libexecutorch_jni.so" -exec bash -c 'mv "$1" "${1/_jni/}"' bash {} \; # Zip all necessary files into the AAR file - zip -r executorch.aar libs jni/*/libexecutorch.so AndroidManifest.xml - zip -r executorch-llama.aar libs jni/*/libexecutorch_llama_jni.so AndroidManifest.xml + zip -r executorch.aar libs jni/*/libexecutorch.so jni/*/libqnn*.so jni/*/libQnn*.so AndroidManifest.xml + cp executorch.aar executorch-llama.aar popd } -build_android_llm_demo_app() { +build_android_demo_apps() { mkdir -p examples/demo-apps/android/LlamaDemo/app/libs cp ${BUILD_AAR_DIR}/executorch-llama.aar examples/demo-apps/android/LlamaDemo/app/libs pushd examples/demo-apps/android/LlamaDemo ANDROID_HOME="${ANDROID_SDK:-/opt/android/sdk}" ./gradlew build assembleAndroidTest popd + + mkdir -p extension/android/benchmark/app/libs + cp ${BUILD_AAR_DIR}/executorch.aar extension/android/benchmark/app/libs + pushd extension/android/benchmark + ANDROID_HOME="${ANDROID_SDK:-/opt/android/sdk}" ./gradlew build assembleAndroidTest + popd } collect_artifacts_to_be_uploaded() { - TOKENIZER="$1" - ARTIFACTS_DIR_NAME="$2" - DEMO_APP_DIR="${ARTIFACTS_DIR_NAME}/llm_demo_${TOKENIZER}" + ARTIFACTS_DIR_NAME="$1" + DEMO_APP_DIR="${ARTIFACTS_DIR_NAME}/llm_demo" # The app directory is named using its build flavor as a suffix. 
mkdir -p "${DEMO_APP_DIR}" # Collect the app and its test suite @@ -124,20 +132,26 @@ collect_artifacts_to_be_uploaded() { # Collect JAR and AAR cp extension/android/build/libs/executorch.jar "${DEMO_APP_DIR}" find "${BUILD_AAR_DIR}/" -name 'executorch*.aar' -exec cp {} "${DEMO_APP_DIR}" \; + # Collect MiniBench APK + MINIBENCH_APP_DIR="${ARTIFACTS_DIR_NAME}/minibench" + mkdir -p "${MINIBENCH_APP_DIR}" + cp extension/android/benchmark/app/build/outputs/apk/debug/*.apk "${MINIBENCH_APP_DIR}" + cp extension/android/benchmark/app/build/outputs/apk/androidTest/debug/*.apk "${MINIBENCH_APP_DIR}" } BUILD_AAR_DIR="$(mktemp -d)" export BUILD_AAR_DIR -ANDROID_ABIS=("arm64-v8a" "x86_64") +if [ -z "$ANDROID_ABIS" ]; then + ANDROID_ABIS=("arm64-v8a" "x86_64") +fi export ANDROID_ABIS -TOKENIZER="${1:-bpe}" -ARTIFACTS_DIR_NAME="$2" +ARTIFACTS_DIR_NAME="$1" build_jar for ANDROID_ABI in "${ANDROID_ABIS[@]}"; do - build_android_native_library ${ANDROID_ABI} ${TOKENIZER} + build_android_native_library ${ANDROID_ABI} done build_aar -build_android_llm_demo_app -collect_artifacts_to_be_uploaded ${TOKENIZER} ${ARTIFACTS_DIR_NAME} +build_android_demo_apps +collect_artifacts_to_be_uploaded ${ARTIFACTS_DIR_NAME} diff --git a/build/build_apple_frameworks.sh b/build/build_apple_frameworks.sh index a22fd4ecb9d..6e3b8c0c5ed 100755 --- a/build/build_apple_frameworks.sh +++ b/build/build_apple_frameworks.sh @@ -33,6 +33,7 @@ libexecutorch_no_prim_ops.a,\ libextension_apple.a,\ libextension_data_loader.a,\ libextension_module.a,\ +libextension_tensor.a,\ :$HEADERS_PATH" FRAMEWORK_BACKEND_COREML="backend_coreml:\ @@ -56,7 +57,7 @@ libcustom_ops.a,\ FRAMEWORK_KERNELS_OPTIMIZED="kernels_optimized:\ liboptimized_kernels.a,\ -liboptimized_ops_lib.a,\ +liboptimized_native_cpu_ops_lib.a,\ :" FRAMEWORK_KERNELS_PORTABLE="kernels_portable:\ @@ -162,9 +163,11 @@ cmake_build() { -DEXECUTORCH_BUILD_COREML=$COREML \ -DEXECUTORCH_BUILD_MPS=$MPS \ -DEXECUTORCH_BUILD_XNNPACK=$XNNPACK \ + -DEXECUTORCH_XNNPACK_SHARED_WORKSPACE=ON \ -DEXECUTORCH_BUILD_EXTENSION_APPLE=ON \ -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ -DEXECUTORCH_BUILD_KERNELS_CUSTOM=$CUSTOM \ -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=$OPTIMIZED \ -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=$QUANTIZED \ @@ -188,6 +191,7 @@ mkdir -p "$HEADERS_PATH" "$SOURCE_ROOT_DIR"/build/print_exported_headers.py --buck2="$BUCK2" --targets \ //extension/module: \ + //extension/tensor: \ | rsync -av --files-from=- "$SOURCE_ROOT_DIR" "$HEADERS_PATH/executorch" cp "$SOURCE_ROOT_DIR/extension/apple/ExecuTorch/Exported/"*.h "$HEADERS_PATH/executorch" diff --git a/build/build_apple_llm_demo.sh b/build/build_apple_llm_demo.sh new file mode 100755 index 00000000000..9fe1c1bcd77 --- /dev/null +++ b/build/build_apple_llm_demo.sh @@ -0,0 +1,58 @@ +#!/bin/bash +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +set -euo pipefail + +ARTIFACTS_DIR_NAME="$1" +APP_PATH="extension/apple/Benchmark/Benchmark" + +xcodebuild build-for-testing \ + -project "${APP_PATH}.xcodeproj" \ + -scheme Benchmark \ + -destination "platform=iOS" \ + -sdk iphoneos \ + -allowProvisioningUpdates \ + DEVELOPMENT_TEAM=78E7V7QP35 \ + CODE_SIGN_STYLE=Manual \ + PROVISIONING_PROFILE_SPECIFIER="ExecuTorch Benchmark" \ + CODE_SIGN_IDENTITY="iPhone Distribution" \ + CODE_SIGNING_REQUIRED=No \ + CODE_SIGNING_ALLOWED=No + +# The hack to figure out where the xctest package locates +BUILD_DIR=$(xcodebuild -showBuildSettings -project "$APP_PATH.xcodeproj" -json | jq -r ".[0].buildSettings.BUILD_DIR") + +# Prepare the demo app, debug mode here is the default from xcodebuild and match +# with what we have in the test spec +MODE="Release" +PLATFORM="iphoneos" +pushd "${BUILD_DIR}/${MODE}-${PLATFORM}" + +rm -rf Payload && mkdir Payload +APP_NAME=Benchmark + +ls -lah +cp -r "${APP_NAME}.app" Payload && zip -vr "${APP_NAME}.ipa" Payload + +popd + +# Prepare the test suite +pushd "${BUILD_DIR}" + +ls -lah +zip -vr "${APP_NAME}.xctestrun.zip" *.xctestrun + +popd + +if [[ -n "${ARTIFACTS_DIR_NAME}" ]]; then + mkdir -p "${ARTIFACTS_DIR_NAME}" + # Prepare all the artifacts to upload + cp "${BUILD_DIR}/${MODE}-${PLATFORM}/${APP_NAME}.ipa" "${ARTIFACTS_DIR_NAME}/" + cp "${BUILD_DIR}/${APP_NAME}.xctestrun.zip" "${ARTIFACTS_DIR_NAME}/" + + ls -lah "${ARTIFACTS_DIR_NAME}/" +fi diff --git a/build/cmake_deps.toml b/build/cmake_deps.toml index a051dad027d..c0011f175ea 100644 --- a/build/cmake_deps.toml +++ b/build/cmake_deps.toml @@ -73,6 +73,7 @@ excludes = [ deps = [ "executorch", "executorch_no_prim_ops", + "extension_threadpool", "portable_kernels", ] @@ -116,6 +117,20 @@ deps = [ "executorch", ] +[targets.optimized_native_cpu_ops_oss] +buck_targets = [ + "//configurations:optimized_native_cpu_ops_oss", +] +filters = [ + ".cpp$", +] +excludes = [ +] +deps = [ + "executorch_no_prim_ops", + "executorch", + "portable_kernels", +] # ---------------------------------- core end ---------------------------------- # ---------------------------------- extension start ---------------------------------- [targets.extension_data_loader] @@ -171,6 +186,58 @@ deps = [ "extension_module", "extension_runner_util", ] + +[targets.extension_tensor] +buck_targets = [ + "//extension/tensor:tensor", +] +filters = [ + ".cpp$", +] +deps = [ + "executorch", + "executorch_no_prim_ops", +] + +[targets.extension_threadpool] +buck_targets = [ + "//extension/threadpool:threadpool", +] +filters = [ + ".cpp$", +] +deps = [ + "executorch", + "executorch_no_prim_ops", +] + +[targets.extension_training] +buck_targets = [ + "//extension/training/module:training_module", + "//extension/training/optimizer:sgd", +] +filters = [ + ".cpp$", +] +deps = [ + "executorch_no_prim_ops", +] + +[targets.train_xor] +buck_targets = [ + "//extension/training/examples/XOR:train_xor", +] +filters = [ + ".cpp$", +] +excludes = [ + "^codegen", +] +deps = [ + "executorch", + "executorch_no_prim_ops", + "portable_kernels", +] # ---------------------------------- extension end ---------------------------------- # ---------------------------------- binary start ---------------------------------- @@ -298,7 +365,10 @@ buck_targets = [ "//extension/llm/custom_ops:custom_ops", ] filters = [ - ".cpp$", + # Second clause is to pick up fht_neon.c/fht_avx.c from FFHT. TODO: + # remove filters and patch extract_sources.py's Buck query to fetch + # srcs; presumably filters is here to remove .h files. 
+ "(.cpp$)|(fht.*\\.c$)", ] excludes = [ "^codegen", @@ -307,6 +377,7 @@ deps = [ "executorch", "executorch_no_prim_ops", "optimized_kernels", + "extension_threadpool", "xnnpack_backend", ] @@ -329,5 +400,6 @@ deps = [ "portable_kernels", "quantized_kernels", "xnnpack_backend", + "optimized_native_cpu_ops_oss", ] # ---------------------------------- LLama end ---------------------------------- diff --git a/build/executorch-config.cmake b/build/executorch-config.cmake index 962990d7c82..18b6c7801b9 100644 --- a/build/executorch-config.cmake +++ b/build/executorch-config.cmake @@ -46,6 +46,9 @@ set(lib_list extension_module extension_module_static extension_runner_util + extension_tensor + extension_threadpool + extension_training xnnpack_backend XNNPACK cpuinfo diff --git a/build/extract_sources.py b/build/extract_sources.py index ce8b3de9812..5004fe0c508 100755 --- a/build/extract_sources.py +++ b/build/extract_sources.py @@ -11,7 +11,7 @@ import re from enum import Enum -from typing import Any, Optional, Sequence +from typing import Any, List, Optional, Sequence from buck_util import Buck2Runner @@ -96,7 +96,12 @@ def __init__( else: self._config[k] = v - def get_sources(self, graph: "Graph", runner: Buck2Runner) -> frozenset[str]: + def get_sources( + self, graph: "Graph", runner: Buck2Runner, buck_args: Optional[List[str]] + ) -> frozenset[str]: + if buck_args is None: + buck_args = [] + if self._state == Target._InitState.READY: return self._sources # Detect cycles. @@ -113,7 +118,7 @@ def get_sources(self, graph: "Graph", runner: Buck2Runner) -> frozenset[str]: ) # Get the complete list of source files that this target depends on. - sources: set[str] = set(runner.run(["cquery", query])) + sources: set[str] = set(runner.run(["cquery", query] + buck_args)) # Keep entries that match all of the filters. filters = [re.compile(p) for p in self._config.get("filters", [])] @@ -128,7 +133,9 @@ def get_sources(self, graph: "Graph", runner: Buck2Runner) -> frozenset[str]: # its deps. Remove entries that are already covered by the transitive # set of dependencies. for dep in self._config.get("deps", []): - sources.difference_update(graph.by_name[dep].get_sources(graph, runner)) + sources.difference_update( + graph.by_name[dep].get_sources(graph, runner, buck_args) + ) self._sources = frozenset(sources) self._state = Target._InitState.READY @@ -173,6 +180,9 @@ def parse_args() -> argparse.Namespace: metavar="file", help="Path to the file to generate.", ) + parser.add_argument( + "--target-platforms", help="--target-platforms to pass to buck cquery, if any." + ) return parser.parse_args() @@ -199,8 +209,12 @@ def main(): # Run the queries and get the lists of source files. target_to_srcs: dict[str, list[str]] = {} runner: Buck2Runner = Buck2Runner(args.buck2) + buck_args = [] + if args.target_platforms: + buck_args = ["--target-platforms"] + buck_args.append(args.target_platforms) for name, target in graph.by_name.items(): - target_to_srcs[name] = sorted(target.get_sources(graph, runner)) + target_to_srcs[name] = sorted(target.get_sources(graph, runner, buck_args)) # Generate the requested format. 
output: bytes diff --git a/build/pip_data_bin_init.py.in b/build/pip_data_bin_init.py.in index 9644c5621df..0c9d60e0498 100644 --- a/build/pip_data_bin_init.py.in +++ b/build/pip_data_bin_init.py.in @@ -21,7 +21,9 @@ def _find_executable_files_under(dir): for filename in os.listdir(dir): filepath = os.path.join(dir, filename) if os.path.isfile(filepath) and os.access(filepath, os.X_OK): - bin_names.append(filename) + # Remove .exe suffix on windows. + filename_without_ext = os.path.splitext(filename)[0] + bin_names.append(filename_without_ext) return bin_names # The list of binaries to create wrapper functions for. diff --git a/build/test_ios_ci.sh b/build/test_ios_ci.sh index 5fa6ef7d246..50c6448d4b2 100755 --- a/build/test_ios_ci.sh +++ b/build/test_ios_ci.sh @@ -11,6 +11,9 @@ APP_PATH="examples/demo-apps/apple_ios/ExecuTorchDemo/ExecuTorchDemo" MODEL_NAME="mv3" SIMULATOR_NAME="executorch" +# If this is set, copy the build artifacts to this directory +ARTIFACTS_DIR_NAME="$1" + finish() { EXIT_STATUS=$? if xcrun simctl list | grep -q "$SIMULATOR_NAME"; then @@ -64,3 +67,49 @@ xcodebuild test \ -project "$APP_PATH.xcodeproj" \ -scheme MobileNetClassifierTest \ -destination name="$SIMULATOR_NAME" + +# NB: https://docs.aws.amazon.com/devicefarm/latest/developerguide/test-types-ios-xctest-ui.html +say "Package The Test Suite" + +xcodebuild build-for-testing \ + -project "$APP_PATH.xcodeproj" \ + -scheme MobileNetClassifierTest \ + -destination platform="iOS" \ + -allowProvisioningUpdates \ + DEVELOPMENT_TEAM=78E7V7QP35 \ + CODE_SIGN_STYLE=Manual \ + PROVISIONING_PROFILE_SPECIFIER=ExecuTorchDemo \ + CODE_SIGN_IDENTITY="iPhone Distribution" + +# The hack to figure out where the xctest package locates +BUILD_DIR=$(xcodebuild -showBuildSettings -project "$APP_PATH.xcodeproj" -json | jq -r ".[0].buildSettings.BUILD_DIR") + +# Prepare the demo app +MODE="Debug" +PLATFORM="iphoneos" +pushd "${BUILD_DIR}/${MODE}-${PLATFORM}" + +rm -rf Payload && mkdir Payload +MOCK_APP_NAME=ExecuTorchDemo + +ls -lah +cp -r "${MOCK_APP_NAME}.app" Payload && zip -vr "${MOCK_APP_NAME}.ipa" Payload + +popd + +# Prepare the test suite +pushd "${BUILD_DIR}" + +ls -lah +zip -vr "${MOCK_APP_NAME}.xctestrun.zip" *.xctestrun + +popd + +if [[ -n "${ARTIFACTS_DIR_NAME}" ]]; then + mkdir -p "${ARTIFACTS_DIR_NAME}" + # Prepare all the artifacts to upload + cp "${BUILD_DIR}/${MODE}-${PLATFORM}/${MOCK_APP_NAME}.ipa" "${ARTIFACTS_DIR_NAME}/" + cp "${BUILD_DIR}/${MOCK_APP_NAME}.xctestrun.zip" "${ARTIFACTS_DIR_NAME}/" + + ls -lah "${ARTIFACTS_DIR_NAME}/" +fi diff --git a/codegen/templates/RegisterCodegenUnboxedKernels.cpp b/codegen/templates/RegisterCodegenUnboxedKernels.cpp index a7790be7fed..3076cde1a99 100644 --- a/codegen/templates/RegisterCodegenUnboxedKernels.cpp +++ b/codegen/templates/RegisterCodegenUnboxedKernels.cpp @@ -8,6 +8,7 @@ #include #include +#include #include #include #include "${fn_header}" // Generated Function import headers @@ -21,7 +22,8 @@ // JIT op registry instead of c10 dispatcher. JIT op registry only takes boxed // kernels, so we are calling unboxing functions in UnboxingFunctions.h to cast // arguments into C++ types (instead of IValue) and delegate to unboxed kernels. 
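The pip_data_bin_init.py.in change above is about Windows wheels: bundled binaries are discovered as, e.g., `flatc.exe`, but the generated wrapper functions should keep the extension-less names used on Linux and macOS. A sketch of the updated helper, mirroring the diff:

```python
# Mirrors the updated _find_executable_files_under() in
# build/pip_data_bin_init.py.in: collect executable files and register them
# without their extension, so "flatc.exe" and "flatc" map to the same wrapper.
import os
from typing import List


def find_executable_names(directory: str) -> List[str]:
    names = []
    for filename in os.listdir(directory):
        path = os.path.join(directory, filename)
        if os.path.isfile(path) and os.access(path, os.X_OK):
            names.append(os.path.splitext(filename)[0])  # drops ".exe" on Windows
    return names
```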
-using KernelArrayRef = ::torch::executor::ArrayRef<::torch::executor::Kernel>; +using KernelSpan = + ::executorch::runtime::Span; namespace torch { namespace executor { namespace function { @@ -31,15 +33,15 @@ static Kernel kernels_to_register[] = { ${unboxed_kernels} // Generated kernels }; -// Explicitly convert to ArrayRef, so that the API can take an empty C array of +// Explicitly convert to Span, so that the API can take an empty C array of // Kernels. -static KernelArrayRef kernel_array_ref( +static KernelSpan kernel_span( kernels_to_register, kernels_to_register + sizeof(kernels_to_register) / sizeof(Kernel)); // Return value not used. Keep the static variable assignment to register // kernels in static initialization time. -static auto success_with_kernel_reg = register_kernels(kernel_array_ref); +static auto success_with_kernel_reg = register_kernels(kernel_span); } // namespace } // namespace function } // namespace executor diff --git a/codegen/templates/RegisterKernels.cpp b/codegen/templates/RegisterKernels.cpp index 2313a30a307..91eac200222 100644 --- a/codegen/templates/RegisterKernels.cpp +++ b/codegen/templates/RegisterKernels.cpp @@ -19,7 +19,8 @@ Error register_all_kernels() { Kernel kernels_to_register[] = { ${unboxed_kernels} // Generated kernels }; - Error success_with_kernel_reg = register_kernels(kernels_to_register); + Error success_with_kernel_reg = + ::executorch::runtime::register_kernels({kernels_to_register}); if (success_with_kernel_reg != Error::Ok) { ET_LOG(Error, "Failed register all kernels"); return success_with_kernel_reg; diff --git a/codegen/tools/gen_oplist.py b/codegen/tools/gen_oplist.py index f21fb8dc6b5..fbb191a6a81 100644 --- a/codegen/tools/gen_oplist.py +++ b/codegen/tools/gen_oplist.py @@ -230,7 +230,7 @@ def gen_oplist( if model_file_path: assert os.path.isfile( model_file_path - ), "The value for --model_file_path needs to be a valid file." + ), f"The value for --model_file_path needs to be a valid file, got {model_file_path}" op_set.update(_get_operators(model_file_path)) source_name = model_file_path et_kernel_metadata = merge_et_kernel_metadata( @@ -239,7 +239,7 @@ def gen_oplist( if ops_schema_yaml_path: assert os.path.isfile( ops_schema_yaml_path - ), "The value for --ops_schema_yaml_path needs to be a valid file." 
+ ), f"The value for --ops_schema_yaml_path needs to be a valid file, got {ops_schema_yaml_path}" et_kernel_metadata = merge_et_kernel_metadata( et_kernel_metadata, _get_et_kernel_metadata_from_ops_yaml(ops_schema_yaml_path), @@ -300,14 +300,33 @@ def main(args: List[Any]) -> None: ) options = parser.parse_args(args) - gen_oplist( - output_path=options.output_path, - model_file_path=options.model_file_path, - ops_schema_yaml_path=options.ops_schema_yaml_path, - root_ops=options.root_ops, - ops_dict=options.ops_dict, - include_all_operators=options.include_all_operators, - ) + try: + gen_oplist( + output_path=options.output_path, + model_file_path=options.model_file_path, + ops_schema_yaml_path=options.ops_schema_yaml_path, + root_ops=options.root_ops, + ops_dict=options.ops_dict, + include_all_operators=options.include_all_operators, + ) + except Exception as e: + command = ["python codegen/tools/gen_oplist.py"] + if options.model_file_path: + command.append(f"--model_file_path {options.model_file_path}") + if options.ops_schema_yaml_path: + command.append(f"--ops_schema_yaml_path {options.ops_schema_yaml_path}") + if options.root_ops: + command.append(f"--root_ops {options.root_ops}") + if options.ops_dict: + command.append(f"--ops_dict {options.ops_dict}") + if options.include_all_operators: + command.append("--include-all-operators") + repro_command = " ".join(command) + raise RuntimeError( + f"""Failed to generate selected_operators.yaml. Repro command: + {repro_command} + """ + ) from e if __name__ == "__main__": diff --git a/codegen/tools/test/test_gen_oplist.py b/codegen/tools/test/test_gen_oplist.py index d455ddb6899..bd1d0082489 100644 --- a/codegen/tools/test/test_gen_oplist.py +++ b/codegen/tools/test/test_gen_oplist.py @@ -42,7 +42,7 @@ def test_gen_op_list_with_wrong_path( mock_get_operators: NonCallableMock, ) -> None: args = ["--output_path=wrong_path", "--model_file_path=path2"] - with self.assertRaises(AssertionError): + with self.assertRaises(RuntimeError): gen_oplist.main(args) @patch("executorch.codegen.tools.gen_oplist._get_kernel_metadata_for_model") diff --git a/configurations/targets.bzl b/configurations/targets.bzl index dc88c137441..6a5341c2904 100644 --- a/configurations/targets.bzl +++ b/configurations/targets.bzl @@ -20,7 +20,7 @@ def define_common_targets(): runtime.cxx_library( name = "executor_cpu_optimized", exported_deps = [ - "//executorch/backends/xnnpack/threadpool:threadpool", + "//executorch/extension/threadpool:threadpool", ] + get_all_cpu_backend_targets(), visibility = [ "//executorch/test/...", @@ -28,7 +28,7 @@ def define_common_targets(): ], ) - # Add a commong configuration of cpu optimized operators. This adds a bit of confusion + # Add a common configuration of cpu optimized operators. This adds a bit of confusion # with the above executorch_cpu_optimized target. 
Generally it would make sense # to just add optimized operators to that target but because executorch_cpu_optimized # might be used elsewhere, I dont want to include ops in that target and find out @@ -50,3 +50,21 @@ def define_common_targets(): "@EXECUTORCH_CLIENTS", ], ) + + # TODO(T183193812): delete this target after optimized-oss.yaml is gone + executorch_generated_lib( + name = "optimized_native_cpu_ops_oss", + deps = [ + "//executorch/kernels/optimized:optimized_operators", + "//executorch/kernels/optimized:optimized_oplist", + "//executorch/kernels/portable:executorch_aten_ops", + "//executorch/kernels/portable:operators", + ], + functions_yaml_target = "//executorch/kernels/optimized:optimized-oss.yaml", + fallback_yaml_target = "//executorch/kernels/portable:functions.yaml", + define_static_targets = True, + visibility = [ + "//executorch/examples/...", + "@EXECUTORCH_CLIENTS", + ], + ) diff --git a/sdk/CMakeLists.txt b/devtools/CMakeLists.txt similarity index 86% rename from sdk/CMakeLists.txt rename to devtools/CMakeLists.txt index 79903fc315e..776d421a8d3 100644 --- a/sdk/CMakeLists.txt +++ b/devtools/CMakeLists.txt @@ -78,8 +78,8 @@ set_property(TARGET flatccrt PROPERTY POSITION_INDEPENDENT_CODE ON) include(ExternalProject) # The include directory that will contain the generated schema headers. -set(_program_schema__include_dir "${CMAKE_BINARY_DIR}/sdk/include") -set(_bundled_schema__include_dir "${CMAKE_BINARY_DIR}/sdk/bundled_program") +set(_program_schema__include_dir "${CMAKE_BINARY_DIR}/devtools/include") +set(_bundled_schema__include_dir "${CMAKE_BINARY_DIR}/devtools/bundled_program") # TODO(dbort): Only enable this when cross-compiling. It can cause build race # conditions (libflatcc.a errors) when enabled. @@ -92,11 +92,11 @@ if(EXECUTORCH_SEPARATE_FLATCC_HOST_PROJECT) # headers on the host during the build, even if we're cross-compiling the # flatcc runtime to a different architecture. execute_process( - COMMAND ${CMAKE_COMMAND} ${_flatcc_source_dir} - -DFLATCC_TEST=OFF -DFLATCC_REFLECTION=OFF - # See above comment about POSITION_INDEPENDENT_CODE. - -DCMAKE_POSITION_INDEPENDENT_CODE=ON - -B${CMAKE_BINARY_DIR}/_host_build + COMMAND + ${CMAKE_COMMAND} ${_flatcc_source_dir} -DFLATCC_TEST=OFF + -DFLATCC_REFLECTION=OFF + # See above comment about POSITION_INDEPENDENT_CODE. 
+ -DCMAKE_POSITION_INDEPENDENT_CODE=ON -B${CMAKE_BINARY_DIR}/_host_build ) execute_process( COMMAND ${CMAKE_COMMAND} --build ${CMAKE_BINARY_DIR}/_host_build @@ -128,11 +128,11 @@ set(_etdump_schema__outputs) foreach(fbs_file ${_etdump_schema_names}) string(REGEX REPLACE "[.]fbs$" "_reader.h" generated "${fbs_file}") list(APPEND _etdump_schema__outputs - "${_program_schema__include_dir}/executorch/sdk/etdump/${generated}" + "${_program_schema__include_dir}/executorch/devtools/etdump/${generated}" ) string(REGEX REPLACE "[.]fbs$" "_builder.h" generated "${fbs_file}") list(APPEND _etdump_schema__outputs - "${_program_schema__include_dir}/executorch/sdk/etdump/${generated}" + "${_program_schema__include_dir}/executorch/devtools/etdump/${generated}" ) endforeach() @@ -143,7 +143,7 @@ foreach(fbs_file ${_bundled_input_schema_names}) list( APPEND _bundled_program_schema__outputs - "${_bundled_schema__include_dir}/executorch/sdk/bundled_program/schema/${generated}" + "${_bundled_schema__include_dir}/executorch/devtools/bundled_program/schema/${generated}" ) endforeach() @@ -152,9 +152,9 @@ add_library( bundled_program_schema INTERFACE ${_bundled_program_schema__outputs} ) -file(MAKE_DIRECTORY ${_program_schema__include_dir}/executorch/sdk/etdump) +file(MAKE_DIRECTORY ${_program_schema__include_dir}/executorch/devtools/etdump) file(MAKE_DIRECTORY - ${_program_schema__include_dir}/executorch/sdk/bundled_program + ${_program_schema__include_dir}/executorch/devtools/bundled_program ) add_custom_command( @@ -164,7 +164,7 @@ add_custom_command( # tree instead of under the binary directory, and there's no way to change # that behavior. ${_flatcc_source_dir}/bin/flatcc -cwr -o - ${_program_schema__include_dir}/executorch/sdk/etdump + ${_program_schema__include_dir}/executorch/devtools/etdump ${_etdump_schema__srcs} COMMAND rm -f ${_etdump_schema_cleanup_paths} DEPENDS ${_etdump_schema_gen_dep} @@ -186,9 +186,9 @@ add_custom_command( OUTPUT ${_bundled_program_schema__outputs} COMMAND ${FLATC_EXECUTABLE} --cpp --cpp-std c++11 --gen-mutable --scoped-enums -o - "${_bundled_schema__include_dir}/executorch/sdk/bundled_program/schema" + "${_bundled_schema__include_dir}/executorch/devtools/bundled_program/schema" ${_bundled_program_schema__srcs} - WORKING_DIRECTORY ${CMAKE_BINARY_DIR}/sdk + WORKING_DIRECTORY ${CMAKE_BINARY_DIR}/devtools DEPENDS ${FLATC_EXECUTABLE} ${_bundled_program_schema__srcs} COMMENT "Generating bundled_program headers" VERBATIM diff --git a/sdk/TARGETS b/devtools/TARGETS similarity index 54% rename from sdk/TARGETS rename to devtools/TARGETS index 56d38a4ad3b..06964b83876 100644 --- a/sdk/TARGETS +++ b/devtools/TARGETS @@ -6,8 +6,8 @@ python_library( name = "lib", srcs = ["__init__.py"], deps = [ - "//executorch/sdk/bundled_program:core", - "//executorch/sdk/etrecord:etrecord", - "//executorch/sdk/inspector:lib", + "//executorch/devtools/bundled_program:core", + "//executorch/devtools/etrecord:etrecord", + "//executorch/devtools/inspector:lib", ], ) diff --git a/sdk/__init__.py b/devtools/__init__.py similarity index 57% rename from sdk/__init__.py rename to devtools/__init__.py index 11134bf276a..821d75901f2 100644 --- a/sdk/__init__.py +++ b/devtools/__init__.py @@ -4,10 +4,10 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
-import executorch.sdk.inspector as inspector -from executorch.sdk.bundled_program.core import BundledProgram -from executorch.sdk.etrecord import ETRecord, generate_etrecord, parse_etrecord -from executorch.sdk.inspector import Inspector +import executorch.devtools.inspector as inspector +from executorch.devtools.bundled_program.core import BundledProgram +from executorch.devtools.etrecord import ETRecord, generate_etrecord, parse_etrecord +from executorch.devtools.inspector import Inspector __all__ = [ "ETRecord", diff --git a/sdk/backend_debug/TARGETS b/devtools/backend_debug/TARGETS similarity index 100% rename from sdk/backend_debug/TARGETS rename to devtools/backend_debug/TARGETS diff --git a/sdk/backend_debug/__init__.py b/devtools/backend_debug/__init__.py similarity index 83% rename from sdk/backend_debug/__init__.py rename to devtools/backend_debug/__init__.py index c1c9726b86b..b457b7d11d5 100644 --- a/sdk/backend_debug/__init__.py +++ b/devtools/backend_debug/__init__.py @@ -4,7 +4,7 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -from executorch.sdk.backend_debug.delegation_info import ( +from executorch.devtools.backend_debug.delegation_info import ( DelegationBreakdown, get_delegation_info, ) diff --git a/sdk/backend_debug/delegation_info.py b/devtools/backend_debug/delegation_info.py similarity index 100% rename from sdk/backend_debug/delegation_info.py rename to devtools/backend_debug/delegation_info.py diff --git a/sdk/backend_debug/tests/TARGETS b/devtools/backend_debug/tests/TARGETS similarity index 86% rename from sdk/backend_debug/tests/TARGETS rename to devtools/backend_debug/tests/TARGETS index 3c9f6c2e64e..ae234df8ce4 100644 --- a/sdk/backend_debug/tests/TARGETS +++ b/devtools/backend_debug/tests/TARGETS @@ -10,8 +10,8 @@ python_unittest( deps = [ "fbsource//third-party/pypi/pandas:pandas", "//caffe2:torch", + "//executorch/devtools/backend_debug:delegation_info", "//executorch/exir:lib", "//executorch/exir/backend/test:op_partitioner_demo", - "//executorch/sdk/backend_debug:delegation_info", ], ) diff --git a/sdk/backend_debug/tests/test_delegation_info.py b/devtools/backend_debug/tests/test_delegation_info.py similarity index 96% rename from sdk/backend_debug/tests/test_delegation_info.py rename to devtools/backend_debug/tests/test_delegation_info.py index 2d98e9a5950..6ff5169094b 100644 --- a/sdk/backend_debug/tests/test_delegation_info.py +++ b/devtools/backend_debug/tests/test_delegation_info.py @@ -9,9 +9,9 @@ import pandas as pd import torch +from executorch.devtools.backend_debug import DelegationBreakdown, get_delegation_info from executorch.exir import to_edge from executorch.exir.backend.test.op_partitioner_demo import AddMulPartitionerDemo -from executorch.sdk.backend_debug import DelegationBreakdown, get_delegation_info from pandas.testing import assert_frame_equal diff --git a/sdk/bundled_program/TARGETS b/devtools/bundled_program/TARGETS similarity index 88% rename from sdk/bundled_program/TARGETS rename to devtools/bundled_program/TARGETS index c731606217f..27560f70877 100644 --- a/sdk/bundled_program/TARGETS +++ b/devtools/bundled_program/TARGETS @@ -18,10 +18,10 @@ runtime.python_library( ":config", ":version", "//caffe2:torch", + "//executorch/devtools/bundled_program/schema:bundled_program_schema_py", "//executorch/exir:schema", "//executorch/exir:tensor", "//executorch/exir/_serialize:lib", - "//executorch/sdk/bundled_program/schema:bundled_program_schema_py", 
], ) @@ -46,6 +46,6 @@ runtime.python_library( "version.py", ], visibility = [ - "//executorch/sdk/...", + "//executorch/devtools/...", ], ) diff --git a/sdk/bundled_program/bundled_program.cpp b/devtools/bundled_program/bundled_program.cpp similarity index 91% rename from sdk/bundled_program/bundled_program.cpp rename to devtools/bundled_program/bundled_program.cpp index 63affa5c7f7..54f84f6fef1 100644 --- a/sdk/bundled_program/bundled_program.cpp +++ b/devtools/bundled_program/bundled_program.cpp @@ -6,7 +6,7 @@ * LICENSE file in the root directory of this source tree. */ -#include +#include #include #include @@ -16,20 +16,28 @@ #include #endif // USE_ATEN_LIB +#include #include #include #include #include #include -#include -namespace torch { -namespace executor { +using exec_aten::ArrayRef; +using exec_aten::Half; +using exec_aten::ScalarType; +using exec_aten::Tensor; +using ::executorch::runtime::Error; +using ::executorch::runtime::EValue; +using ::executorch::runtime::Method; +using ::executorch::runtime::Result; + +namespace executorch { namespace bundled_program { namespace { -#define kMaxDim 16 +constexpr size_t kMaxDim = 16; #ifdef USE_ATEN_LIB @@ -53,6 +61,7 @@ at::Tensor tensor_like(bundled_program_flatbuffer::Tensor* bundled_tensor) { } #else // !USE_ATEN_LIB +using torch::executor::TensorImpl; // Create a tensorimpl with same content using bundled tensor TensorImpl impl_like(bundled_program_flatbuffer::Tensor* bundled_tensor) { ScalarType scalar_type = @@ -234,9 +243,9 @@ get_method_test_suite( } // namespace // Load testset_idx-th bundled data into the Method -ET_NODISCARD Error LoadBundledInput( +ET_NODISCARD Error load_bundled_input( Method& method, - serialized_bundled_program* bundled_program_ptr, + SerializedBundledProgram* bundled_program_ptr, size_t testset_idx) { ET_CHECK_OR_RETURN_ERROR( bundled_program_flatbuffer::BundledProgramBufferHasIdentifier( @@ -319,19 +328,19 @@ ET_NODISCARD Error LoadBundledInput( ET_CHECK_OR_RETURN_ERROR( status == Error::Ok, NotSupported, - "set_input failed during load bundled inputs with status %" PRIu32, - static_cast(status)); + "set_input failed during load bundled inputs with status 0%" PRIx32, + static_cast(status)); } - internal::event_tracer_set_bundled_input_index( + ::executorch::runtime::internal::event_tracer_set_bundled_input_index( method.get_event_tracer(), testset_idx); return Error::Ok; } -ET_NODISCARD Error VerifyResultWithBundledExpectedOutput( +ET_NODISCARD Error verify_method_outputs( Method& method, - serialized_bundled_program* bundled_program_ptr, + SerializedBundledProgram* bundled_program_ptr, size_t testset_idx, double rtol, double atol) { @@ -390,12 +399,12 @@ ET_NODISCARD Error VerifyResultWithBundledExpectedOutput( return Error::Ok; } -ET_NODISCARD Error GetProgramData( +ET_NODISCARD Error get_program_data( void* file_data, size_t file_data_len, const void** out_program_data, size_t* out_program_data_len) { - if (IsBundledProgram(file_data)) { + if (is_bundled_program(file_data, file_data_len)) { auto program_bundled = bundled_program_flatbuffer::GetBundledProgram(file_data); *out_program_data = program_bundled->program()->data(); @@ -410,11 +419,13 @@ ET_NODISCARD Error GetProgramData( return Error::Ok; } -bool IsBundledProgram(void* file_data) { +bool is_bundled_program(void* file_data, ET_UNUSED size_t file_data_len) { + // Even though the flatbuffer API doesn't accept a length, it's important to + // require one so that we could change the internal representation, or use a + // future API that does 
require a length. return bundled_program_flatbuffer::BundledProgramBufferHasIdentifier( file_data); } } // namespace bundled_program -} // namespace executor -} // namespace torch +} // namespace executorch diff --git a/sdk/bundled_program/bundled_program.h b/devtools/bundled_program/bundled_program.h similarity index 55% rename from sdk/bundled_program/bundled_program.h rename to devtools/bundled_program/bundled_program.h index 8b42923866e..884ca6f21bc 100644 --- a/sdk/bundled_program/bundled_program.h +++ b/devtools/bundled_program/bundled_program.h @@ -11,14 +11,13 @@ #include #include -namespace torch { -namespace executor { +namespace executorch { namespace bundled_program { /** * An opaque pointer to a serialized bundled program. */ -using serialized_bundled_program = const void; +using SerializedBundledProgram = const void; /** * Load testset_idx-th bundled input of method_idx-th Method test in @@ -31,9 +30,9 @@ using serialized_bundled_program = const void; * @returns Return Error::Ok if load successfully, or the error happens during * execution. */ -ET_NODISCARD Error LoadBundledInput( - Method& method, - serialized_bundled_program* bundled_program_ptr, +ET_NODISCARD ::executorch::runtime::Error load_bundled_input( + ::executorch::runtime::Method& method, + SerializedBundledProgram* bundled_program_ptr, size_t testset_idx); /** @@ -49,9 +48,9 @@ ET_NODISCARD Error LoadBundledInput( * @returns Return Error::Ok if two outputs match, or the error happens during * execution. */ -ET_NODISCARD Error VerifyResultWithBundledExpectedOutput( - Method& method, - serialized_bundled_program* bundled_program_ptr, +ET_NODISCARD ::executorch::runtime::Error verify_method_outputs( + ::executorch::runtime::Method& method, + SerializedBundledProgram* bundled_program_ptr, size_t testset_idx, double rtol = 1e-5, double atol = 1e-8); @@ -73,7 +72,7 @@ ET_NODISCARD Error VerifyResultWithBundledExpectedOutput( * in it, and out_program_data/out_program_data_len point to the data. Other * values on failure. */ -ET_NODISCARD Error GetProgramData( +ET_NODISCARD ::executorch::runtime::Error get_program_data( void* file_data, size_t file_data_len, const void** out_program_data, @@ -83,11 +82,61 @@ ET_NODISCARD Error GetProgramData( * Checks whether the given file is a bundled program. * * @param[in] file_data The contents of the given file. + * @param[in] file_data_len The length of file_data, in bytes. * * @returns true if the given file is a bundled program, false otherwise */ -bool IsBundledProgram(void* file_data); +bool is_bundled_program(void* file_data, size_t file_data_len); + +/// DEPRECATED: Use the version with the file_data_len parameter. +ET_DEPRECATED inline bool is_bundled_program(void* file_data) { + // 128 is enough data to contain the identifier in the flatbuffer header. + return is_bundled_program(file_data, 128); +} + +} // namespace bundled_program +} // namespace executorch + +namespace torch { +namespace executor { +namespace bundled_program { +// TODO(T197294990): Remove these deprecated aliases once all users have moved +// to the new `::executorch` namespaces. 
+using serialized_bundled_program = + ::executorch::bundled_program::SerializedBundledProgram; + +ET_NODISCARD inline ::executorch::runtime::Error LoadBundledInput( + ::executorch::runtime::Method& method, + serialized_bundled_program* bundled_program_ptr, + size_t testset_idx) { + return ::executorch::bundled_program::load_bundled_input( + method, bundled_program_ptr, testset_idx); +} + +ET_NODISCARD inline ::executorch::runtime::Error +VerifyResultWithBundledExpectedOutput( + ::executorch::runtime::Method& method, + serialized_bundled_program* bundled_program_ptr, + size_t testset_idx, + double rtol = 1e-5, + double atol = 1e-8) { + return ::executorch::bundled_program::verify_method_outputs( + method, bundled_program_ptr, testset_idx, rtol, atol); +} + +ET_NODISCARD inline ::executorch::runtime::Error GetProgramData( + void* file_data, + size_t file_data_len, + const void** out_program_data, + size_t* out_program_data_len) { + return ::executorch::bundled_program::get_program_data( + file_data, file_data_len, out_program_data, out_program_data_len); +} +inline bool IsBundledProgram(void* file_data) { + // 128 is enough data to contain the identifier in the flatbuffer header. + return ::executorch::bundled_program::is_bundled_program(file_data, 128); +} } // namespace bundled_program } // namespace executor } // namespace torch diff --git a/sdk/bundled_program/config.py b/devtools/bundled_program/config.py similarity index 88% rename from sdk/bundled_program/config.py rename to devtools/bundled_program/config.py index 3bfbe7bc69c..97563177603 100644 --- a/sdk/bundled_program/config.py +++ b/devtools/bundled_program/config.py @@ -39,7 +39,7 @@ """ All supported types for input/expected output of MethodTestCase. -Namedtuple is also supported and listed implicity since it is a subclass of tuple. +Namedtuple is also supported and listed implicitly since it is a subclass of tuple. """ # pyre-ignore @@ -59,23 +59,23 @@ def __init__( """Single test case for verifying specific method Args: - input: All inputs required by eager_model with specific inference method for one-time execution. + inputs: All inputs required by eager_model with specific inference method for one-time execution. It is worth mentioning that, although both bundled program and ET runtime apis support setting input other than `torch.tensor` type, only the input in `torch.tensor` type will be actually updated in the method, and the rest of the inputs will just do a sanity check if they match the default value in method. - expected_output: Expected output of given input for verification. It can be None if user only wants to use the test case for profiling. + expected_outputs: Expected output of given input for verification. It can be None if user only wants to use the test case for profiling. Returns: self """ # TODO(gasoonjia): Update type check logic. - # pyre-ignore [6]: Misalign data type for between MethodTestCase attribute and sannity check. + # pyre-ignore [6]: Misalign data type for between MethodTestCase attribute and sanity check. self.inputs: List[ConfigValue] = self._flatten_and_sanity_check(inputs) self.expected_outputs: List[ConfigValue] = [] if expected_outputs is not None: - # pyre-ignore [6]: Misalign data type for between MethodTestCase attribute and sannity check. + # pyre-ignore [6]: Misalign data type for between MethodTestCase attribute and sanity check. 
self.expected_outputs = self._flatten_and_sanity_check(expected_outputs) def _flatten_and_sanity_check( diff --git a/sdk/bundled_program/core.py b/devtools/bundled_program/core.py similarity index 98% rename from sdk/bundled_program/core.py rename to devtools/bundled_program/core.py index 4fede5e5952..c775fb1510d 100644 --- a/sdk/bundled_program/core.py +++ b/devtools/bundled_program/core.py @@ -8,19 +8,19 @@ import typing from typing import Dict, List, Optional, Sequence, Type, Union -import executorch.exir.schema as core_schema +import executorch.devtools.bundled_program.schema as bp_schema -import executorch.sdk.bundled_program.schema as bp_schema +import executorch.exir.schema as core_schema import torch import torch.fx +from executorch.devtools.bundled_program.config import ConfigValue, MethodTestSuite + +from executorch.devtools.bundled_program.version import BUNDLED_PROGRAM_SCHEMA_VERSION from executorch.exir import ExecutorchProgram, ExecutorchProgramManager from executorch.exir._serialize import _serialize_pte_binary from executorch.exir.tensor import get_scalar_type, scalar_type_enum, TensorSpec -from executorch.sdk.bundled_program.config import ConfigValue, MethodTestSuite - -from executorch.sdk.bundled_program.version import BUNDLED_PROGRAM_SCHEMA_VERSION # pyre-ignore supported_program_type_table: Dict[Type[core_schema.KernelTypes], ConfigValue] = { @@ -230,7 +230,7 @@ def _assert_valid_bundle( Other checks not related to correspondence are done in config.py Args: - program: The program to be bundled. + executorch_program: The program to be bundled. method_test_suites: The testcases for specific methods to be bundled. """ diff --git a/sdk/bundled_program/schema/README.md b/devtools/bundled_program/schema/README.md similarity index 100% rename from sdk/bundled_program/schema/README.md rename to devtools/bundled_program/schema/README.md diff --git a/sdk/bundled_program/schema/TARGETS b/devtools/bundled_program/schema/TARGETS similarity index 84% rename from sdk/bundled_program/schema/TARGETS rename to devtools/bundled_program/schema/TARGETS index e9bd642069d..51c004cbec0 100644 --- a/sdk/bundled_program/schema/TARGETS +++ b/devtools/bundled_program/schema/TARGETS @@ -15,8 +15,8 @@ runtime.python_library( "bundled_program_schema.py", ], visibility = [ - "//executorch/sdk/bundled_program/...", - "//executorch/sdk/etrecord/...", + "//executorch/devtools/bundled_program/...", + "//executorch/devtools/etrecord/...", ], deps = [ "//executorch/exir:scalar_type", diff --git a/sdk/bundled_program/schema/__init__.py b/devtools/bundled_program/schema/__init__.py similarity index 100% rename from sdk/bundled_program/schema/__init__.py rename to devtools/bundled_program/schema/__init__.py diff --git a/sdk/bundled_program/schema/bundled_program_schema.fbs b/devtools/bundled_program/schema/bundled_program_schema.fbs similarity index 100% rename from sdk/bundled_program/schema/bundled_program_schema.fbs rename to devtools/bundled_program/schema/bundled_program_schema.fbs diff --git a/sdk/bundled_program/schema/bundled_program_schema.py b/devtools/bundled_program/schema/bundled_program_schema.py similarity index 100% rename from sdk/bundled_program/schema/bundled_program_schema.py rename to devtools/bundled_program/schema/bundled_program_schema.py diff --git a/sdk/bundled_program/schema/scalar_type.fbs b/devtools/bundled_program/schema/scalar_type.fbs similarity index 100% rename from sdk/bundled_program/schema/scalar_type.fbs rename to devtools/bundled_program/schema/scalar_type.fbs diff --git 
a/sdk/bundled_program/schema/targets.bzl b/devtools/bundled_program/schema/targets.bzl similarity index 93% rename from sdk/bundled_program/schema/targets.bzl rename to devtools/bundled_program/schema/targets.bzl index a25d792c5a3..532a01e039e 100644 --- a/sdk/bundled_program/schema/targets.bzl +++ b/devtools/bundled_program/schema/targets.bzl @@ -49,14 +49,14 @@ def define_common_targets(): runtime.export_file( name = INPUT_BUNDLED, visibility = [ - "//executorch/sdk/bundled_program/serialize/...", + "//executorch/devtools/bundled_program/serialize/...", ], ) runtime.export_file( name = INPUT_SCALAR_TYPE, visibility = [ - "//executorch/sdk/bundled_program/serialize/...", + "//executorch/devtools/bundled_program/serialize/...", ], ) @@ -72,7 +72,7 @@ def define_common_targets(): name = BUNDLED_LIBRARY_NAME, srcs = [], visibility = [ - "//executorch/sdk/bundled_program/...", + "//executorch/devtools/bundled_program/...", "//executorch/extension/pybindings/...", ], exported_headers = { diff --git a/sdk/bundled_program/schema/test/TARGETS b/devtools/bundled_program/schema/test/TARGETS similarity index 100% rename from sdk/bundled_program/schema/test/TARGETS rename to devtools/bundled_program/schema/test/TARGETS diff --git a/sdk/bundled_program/schema/test/test_schema.py b/devtools/bundled_program/schema/test/test_schema.py similarity index 79% rename from sdk/bundled_program/schema/test/test_schema.py rename to devtools/bundled_program/schema/test/test_schema.py index ab3d2760d29..c2a19adef79 100644 --- a/sdk/bundled_program/schema/test/test_schema.py +++ b/devtools/bundled_program/schema/test/test_schema.py @@ -20,8 +20,8 @@ def test_schema_sync(self) -> None: self.assertTrue( filecmp.cmp( - prefix + "sdk/bundled_program/schema/scalar_type.fbs", + prefix + "devtools/bundled_program/schema/scalar_type.fbs", prefix + "schema/scalar_type.fbs", ), - 'Please run "hg cp fbcode//executorch/schema/scalar_type.fbs fbcode//executorch/sdk/bundled_program/schema/scalar_type.fbs" to sync schema changes.', + 'Please run "hg cp fbcode//executorch/schema/scalar_type.fbs fbcode//executorch/devtools/bundled_program/schema/scalar_type.fbs" to sync schema changes.', ) diff --git a/sdk/bundled_program/serialize/TARGETS b/devtools/bundled_program/serialize/TARGETS similarity index 76% rename from sdk/bundled_program/serialize/TARGETS rename to devtools/bundled_program/serialize/TARGETS index 20abccd7fda..11c58399778 100644 --- a/sdk/bundled_program/serialize/TARGETS +++ b/devtools/bundled_program/serialize/TARGETS @@ -10,8 +10,8 @@ runtime.python_library( "__init__.py", ], resources = { - "//executorch/sdk/bundled_program/schema:bundled_program_schema.fbs": "bundled_program_schema.fbs", - "//executorch/sdk/bundled_program/schema:scalar_type.fbs": "scalar_type.fbs", + "//executorch/devtools/bundled_program/schema:bundled_program_schema.fbs": "bundled_program_schema.fbs", + "//executorch/devtools/bundled_program/schema:scalar_type.fbs": "scalar_type.fbs", }, # Currently serialization API should only be used in some dedicated targets, # to avoid ODR violation when linking with another Flatbuffers library. 
@@ -20,18 +20,18 @@ runtime.python_library( "//executorch/bacends/...", "//executorch/backends/xnnpack/test/...", "//executorch/codegen/...", + "//executorch/devtools/bundled_program/tests/...", "//executorch/examples/async_exec:emit_program_lib", "//executorch/exir:lib", "//executorch/extension/pybindings/test:test", "//executorch/extension/pybindings/test:test-library", "//executorch/profiler/...", - "//executorch/sdk/bundled_program/tests/...", "//executorch/test/...", "@EXECUTORCH_CLIENTS", ], deps = [ "fbsource//third-party/pypi/setuptools:setuptools", + "//executorch/devtools/bundled_program/schema:bundled_program_schema_py", "//executorch/exir/_serialize:lib", - "//executorch/sdk/bundled_program/schema:bundled_program_schema_py", ], ) diff --git a/sdk/bundled_program/serialize/__init__.py b/devtools/bundled_program/serialize/__init__.py similarity index 97% rename from sdk/bundled_program/serialize/__init__.py rename to devtools/bundled_program/serialize/__init__.py index e0c75574c93..075436e9c11 100644 --- a/sdk/bundled_program/serialize/__init__.py +++ b/devtools/bundled_program/serialize/__init__.py @@ -12,14 +12,14 @@ import os import tempfile -import executorch.sdk.bundled_program.schema as bp_schema +import executorch.devtools.bundled_program.schema as bp_schema # @manual=fbsource//third-party/pypi/setuptools:setuptools import pkg_resources +from executorch.devtools.bundled_program.core import BundledProgram from executorch.exir._serialize._dataclass import _DataclassEncoder, _json_to_dataclass from executorch.exir._serialize._flatbuffer import _flatc_compile, _flatc_decompile -from executorch.sdk.bundled_program.core import BundledProgram # The prefix of schema files used for bundled program BUNDLED_PROGRAM_SCHEMA_NAME = "bundled_program_schema" diff --git a/sdk/bundled_program/serialize/test/TARGETS b/devtools/bundled_program/serialize/test/TARGETS similarity index 51% rename from sdk/bundled_program/serialize/test/TARGETS rename to devtools/bundled_program/serialize/test/TARGETS index 85f55c02f8d..dd92f63f2dd 100644 --- a/sdk/bundled_program/serialize/test/TARGETS +++ b/devtools/bundled_program/serialize/test/TARGETS @@ -10,9 +10,8 @@ python_unittest( "test_serialize.py", ], deps = [ - "//executorch/exir:print_program", - "//executorch/sdk/bundled_program:core", - "//executorch/sdk/bundled_program/serialize:lib", - "//executorch/sdk/bundled_program/util:test_util", + "//executorch/devtools/bundled_program:core", + "//executorch/devtools/bundled_program/serialize:lib", + "//executorch/devtools/bundled_program/util:test_util", ], ) diff --git a/sdk/bundled_program/serialize/test/test_serialize.py b/devtools/bundled_program/serialize/test/test_serialize.py similarity index 82% rename from sdk/bundled_program/serialize/test/test_serialize.py rename to devtools/bundled_program/serialize/test/test_serialize.py index 1db6871fc06..48a914d1447 100644 --- a/sdk/bundled_program/serialize/test/test_serialize.py +++ b/devtools/bundled_program/serialize/test/test_serialize.py @@ -8,13 +8,15 @@ import unittest -from executorch.sdk.bundled_program.core import BundledProgram +from executorch.devtools.bundled_program.core import BundledProgram -from executorch.sdk.bundled_program.serialize import ( +from executorch.devtools.bundled_program.serialize import ( deserialize_from_flatbuffer_to_bundled_program, serialize_from_bundled_program_to_flatbuffer, ) -from executorch.sdk.bundled_program.util.test_util import get_common_executorch_program +from 
executorch.devtools.bundled_program.util.test_util import ( + get_common_executorch_program, +) class TestSerialize(unittest.TestCase): diff --git a/sdk/bundled_program/targets.bzl b/devtools/bundled_program/targets.bzl similarity index 91% rename from sdk/bundled_program/targets.bzl rename to devtools/bundled_program/targets.bzl index a3268dff2c5..7035b3b31f6 100644 --- a/sdk/bundled_program/targets.bzl +++ b/devtools/bundled_program/targets.bzl @@ -19,7 +19,7 @@ def define_common_targets(): ], deps = [ "//executorch/runtime/core/exec_aten/util:dim_order_util" + aten_suffix, - "//executorch/sdk/bundled_program/schema:bundled_program_schema_fbs", + "//executorch/devtools/bundled_program/schema:bundled_program_schema_fbs", ], exported_deps = [ "//executorch/runtime/core:memory_allocator", diff --git a/sdk/bundled_program/test/TARGETS b/devtools/bundled_program/test/TARGETS similarity index 68% rename from sdk/bundled_program/test/TARGETS rename to devtools/bundled_program/test/TARGETS index caf69be60e1..652c74b8f43 100644 --- a/sdk/bundled_program/test/TARGETS +++ b/devtools/bundled_program/test/TARGETS @@ -1,4 +1,5 @@ # @noautodeps + load("@fbcode_macros//build_defs:python_unittest.bzl", "python_unittest") oncall("executorch") @@ -10,11 +11,11 @@ python_unittest( ], deps = [ "//caffe2:torch", + "//executorch/devtools/bundled_program:config", + "//executorch/devtools/bundled_program:core", + "//executorch/devtools/bundled_program/schema:bundled_program_schema_py", + "//executorch/devtools/bundled_program/util:test_util", "//executorch/exir/_serialize:lib", - "//executorch/sdk/bundled_program:config", - "//executorch/sdk/bundled_program:core", - "//executorch/sdk/bundled_program/schema:bundled_program_schema_py", - "//executorch/sdk/bundled_program/util:test_util", ], ) @@ -25,9 +26,9 @@ python_unittest( ], deps = [ "//caffe2:torch", + "//executorch/devtools/bundled_program:config", + "//executorch/devtools/bundled_program/util:test_util", "//executorch/extension/pytree:pylib", - "//executorch/sdk/bundled_program:config", - "//executorch/sdk/bundled_program/util:test_util", ], ) @@ -38,6 +39,10 @@ python_unittest( ], deps = [ "//caffe2:torch", + "//executorch/devtools/bundled_program:config", + "//executorch/devtools/bundled_program:core", + "//executorch/devtools/bundled_program/serialize:lib", + "//executorch/devtools/bundled_program/util:test_util", "//executorch/exir:dynamic_shape", "//executorch/exir:lib", "//executorch/exir:memory", @@ -54,9 +59,5 @@ python_unittest( "//executorch/extension/pybindings:portable_lib", "//executorch/extension/pytree:pybindings", "//executorch/kernels/portable:custom_ops_generated_lib", - "//executorch/sdk/bundled_program:config", - "//executorch/sdk/bundled_program:core", - "//executorch/sdk/bundled_program/serialize:lib", - "//executorch/sdk/bundled_program/util:test_util", ], ) diff --git a/sdk/bundled_program/test/test_bundle_data.py b/devtools/bundled_program/test/test_bundle_data.py similarity index 93% rename from sdk/bundled_program/test/test_bundle_data.py rename to devtools/bundled_program/test/test_bundle_data.py index a8d9485c5ff..565539cbf15 100644 --- a/sdk/bundled_program/test/test_bundle_data.py +++ b/devtools/bundled_program/test/test_bundle_data.py @@ -9,13 +9,15 @@ import unittest from typing import List -import executorch.sdk.bundled_program.schema as bp_schema +import executorch.devtools.bundled_program.schema as bp_schema import torch +from executorch.devtools.bundled_program.config import ConfigValue +from 
executorch.devtools.bundled_program.core import BundledProgram +from executorch.devtools.bundled_program.util.test_util import ( + get_common_executorch_program, +) from executorch.exir._serialize import _serialize_pte_binary -from executorch.sdk.bundled_program.config import ConfigValue -from executorch.sdk.bundled_program.core import BundledProgram -from executorch.sdk.bundled_program.util.test_util import get_common_executorch_program class TestBundle(unittest.TestCase): diff --git a/sdk/bundled_program/test/test_config.py b/devtools/bundled_program/test/test_config.py similarity index 97% rename from sdk/bundled_program/test/test_config.py rename to devtools/bundled_program/test/test_config.py index 3183ad907fe..21f3d480423 100644 --- a/sdk/bundled_program/test/test_config.py +++ b/devtools/bundled_program/test/test_config.py @@ -10,14 +10,14 @@ from typing import get_args, List, Union import torch -from executorch.extension.pytree import tree_flatten -from executorch.sdk.bundled_program.config import DataContainer +from executorch.devtools.bundled_program.config import DataContainer -from executorch.sdk.bundled_program.util.test_util import ( +from executorch.devtools.bundled_program.util.test_util import ( get_random_test_suites, get_random_test_suites_with_eager_model, SampleModel, ) +from executorch.extension.pytree import tree_flatten class TestConfig(unittest.TestCase): diff --git a/sdk/bundled_program/test/test_end2end.py b/devtools/bundled_program/test/test_end2end.py similarity index 88% rename from sdk/bundled_program/test/test_end2end.py rename to devtools/bundled_program/test/test_end2end.py index 99d58ee15ca..7cee073be0e 100644 --- a/sdk/bundled_program/test/test_end2end.py +++ b/devtools/bundled_program/test/test_end2end.py @@ -21,12 +21,12 @@ import torch -from executorch.sdk.bundled_program.core import BundledProgram -from executorch.sdk.bundled_program.serialize import ( +from executorch.devtools.bundled_program.core import BundledProgram +from executorch.devtools.bundled_program.serialize import ( serialize_from_bundled_program_to_flatbuffer, ) -from executorch.sdk.bundled_program.util.test_util import ( +from executorch.devtools.bundled_program.util.test_util import ( get_common_executorch_program, SampleModel, ) @@ -45,7 +45,7 @@ pass try: - from executorch.extension.pybindings.aten_lib import ( + from executorch.extension.pybindings.aten_lib import ( # @manual=//executorch/extension/pybindings:aten_lib _load_bundled_program_from_buffer, _load_for_executorch_from_buffer, _load_for_executorch_from_bundled_program, diff --git a/sdk/bundled_program/util/TARGETS b/devtools/bundled_program/util/TARGETS similarity index 68% rename from sdk/bundled_program/util/TARGETS rename to devtools/bundled_program/util/TARGETS index 17d19dfb29a..7d019ce30fb 100644 --- a/sdk/bundled_program/util/TARGETS +++ b/devtools/bundled_program/util/TARGETS @@ -7,10 +7,10 @@ python_library( srcs = [ "test_util.py", ], - visibility = ["//executorch/sdk/bundled_program/..."], + visibility = ["//executorch/devtools/bundled_program/..."], deps = [ "//caffe2:torch", + "//executorch/devtools/bundled_program:config", "//executorch/exir:lib", - "//executorch/sdk/bundled_program:config", ], ) diff --git a/sdk/bundled_program/util/test_util.py b/devtools/bundled_program/util/test_util.py similarity index 99% rename from sdk/bundled_program/util/test_util.py rename to devtools/bundled_program/util/test_util.py index bfea8158acb..505186f3a08 100644 --- a/sdk/bundled_program/util/test_util.py +++ 
b/devtools/bundled_program/util/test_util.py @@ -10,14 +10,14 @@ from typing import List, Tuple import torch - -from executorch.exir import ExecutorchProgramManager, to_edge -from executorch.sdk.bundled_program.config import ( +from executorch.devtools.bundled_program.config import ( MethodInputType, MethodOutputType, MethodTestCase, MethodTestSuite, ) + +from executorch.exir import ExecutorchProgramManager, to_edge from torch.export import export from torch.export.unflatten import _assign_attr, _AttrKind diff --git a/sdk/bundled_program/version.py b/devtools/bundled_program/version.py similarity index 100% rename from sdk/bundled_program/version.py rename to devtools/bundled_program/version.py diff --git a/sdk/debug_format/TARGETS b/devtools/debug_format/TARGETS similarity index 100% rename from sdk/debug_format/TARGETS rename to devtools/debug_format/TARGETS diff --git a/sdk/debug_format/base_schema.py b/devtools/debug_format/base_schema.py similarity index 94% rename from sdk/debug_format/base_schema.py rename to devtools/debug_format/base_schema.py index b987c288744..9b6247051ec 100644 --- a/sdk/debug_format/base_schema.py +++ b/devtools/debug_format/base_schema.py @@ -4,8 +4,10 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +# pyre-unsafe + """ -Base Intermediate Representation for Productivity SDK consumers +Base Intermediate Representation for Developer Tools consumers (e.g. TensorBoard, Terminal Debugger) """ diff --git a/sdk/debug_format/et_schema.py b/devtools/debug_format/et_schema.py similarity index 99% rename from sdk/debug_format/et_schema.py rename to devtools/debug_format/et_schema.py index 9a6af4edba9..bb15d70abc4 100644 --- a/sdk/debug_format/et_schema.py +++ b/devtools/debug_format/et_schema.py @@ -4,8 +4,10 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
+# pyre-unsafe + """ -Intermediate Representation of ExecuTorch Concepts in Productivity SDK +Intermediate Representation of ExecuTorch Concepts in Developer Tools """ from __future__ import annotations @@ -21,7 +23,7 @@ import torch from executorch import exir -from executorch.sdk.debug_format.base_schema import ( +from executorch.devtools.debug_format.base_schema import ( Node, OperatorGraph, OperatorNode, diff --git a/sdk/etdump/TARGETS b/devtools/etdump/TARGETS similarity index 81% rename from sdk/etdump/TARGETS rename to devtools/etdump/TARGETS index 22d07478cbe..7dcc4c1e84b 100644 --- a/sdk/etdump/TARGETS +++ b/devtools/etdump/TARGETS @@ -11,7 +11,7 @@ runtime.python_library( "schema_flatcc.py", ], visibility = [ - "//executorch/sdk/...", + "//executorch/devtools/...", ], deps = [ "//executorch/exir:scalar_type", @@ -24,11 +24,11 @@ runtime.python_library( "serialize.py", ], resources = { + "//executorch/devtools/etdump:etdump_schema_flatcc.fbs": "etdump_schema_flatcc.fbs", "//executorch/schema:scalar_type.fbs": "scalar_type.fbs", - "//executorch/sdk/etdump:etdump_schema_flatcc.fbs": "etdump_schema_flatcc.fbs", }, visibility = [ - "//executorch/sdk/...", + "//executorch/devtools/...", ], deps = [ "fbsource//third-party/pypi/setuptools:setuptools", diff --git a/sdk/etdump/emitter.cpp b/devtools/etdump/emitter.cpp similarity index 66% rename from sdk/etdump/emitter.cpp rename to devtools/etdump/emitter.cpp index 1b3cba9d196..653c75cb084 100644 --- a/sdk/etdump/emitter.cpp +++ b/devtools/etdump/emitter.cpp @@ -6,16 +6,25 @@ * LICENSE file in the root directory of this source tree. */ -#include +#include + #include +#include + +#include +#include + +#include -#include "executorch/runtime/platform/assert.h" -#include "executorch/sdk/etdump/emitter.h" +using executorch::etdump::internal::ETDumpStaticAllocator; -namespace torch { -namespace executor { +namespace executorch { +namespace etdump { +namespace internal { -static int _allocator_fn( +namespace { + +int allocator_fn( void* alloc_context, flatcc_iovec_t* b, size_t request, @@ -24,8 +33,8 @@ static int _allocator_fn( void* p; size_t n; - struct etdump_static_allocator* state = - (struct etdump_static_allocator*)alloc_context; + ETDumpStaticAllocator* state = + reinterpret_cast(alloc_context); // This allocator doesn't support freeing memory. if (request == 0) { @@ -113,14 +122,14 @@ static int _allocator_fn( // This emitter implementation emits to a fixed size buffer and will fail if it // runs out of room on either end. -static int _emitter_fn( +int emitter_fn( void* emit_context, const flatcc_iovec_t* iov, int iov_count, flatbuffers_soffset_t offset, size_t len) { - struct etdump_static_allocator* E = - (struct etdump_static_allocator*)emit_context; + ETDumpStaticAllocator* E = + reinterpret_cast(emit_context); uint8_t* p; if (offset < 0) { @@ -144,40 +153,15 @@ static int _emitter_fn( return 0; } -/******************************************************************************* - * Public Functions - ******************************************************************************/ - -int etdump_static_allocator_builder_init( - flatcc_builder_t* builder, - struct etdump_static_allocator* alloc) { - ET_CHECK(builder != nullptr); - ET_CHECK(alloc != nullptr); - - // Ensure data size is multiple of 32 (minimum allocation size). - ET_CHECK((alloc->data_size & 0x1F) == 0); - // Ensure out_size is divisable by 2 to ensure front/back sizes are equal for - // emitter.. 
- ET_CHECK((alloc->out_size & 0x1) == 0); - - return flatcc_builder_custom_init( - builder, _emitter_fn, alloc, _allocator_fn, alloc); -} - -void etdump_static_allocator_reset(struct etdump_static_allocator* alloc) { - ET_CHECK(alloc != nullptr); - alloc->allocated = 0; - size_t n = alloc->out_size / 2; - alloc->front_cursor = &alloc->data[alloc->data_size + n]; - alloc->front_left = n; -} +} // namespace -int et_flatcc_custom_init( +int etdump_flatcc_custom_init( flatcc_builder_t* builder, - struct etdump_static_allocator* alloc) { + struct ETDumpStaticAllocator* alloc) { return flatcc_builder_custom_init( - builder, _emitter_fn, alloc, _allocator_fn, alloc); + builder, emitter_fn, alloc, allocator_fn, alloc); } -} // namespace executor -} // namespace torch +} // namespace internal +} // namespace etdump +} // namespace executorch diff --git a/devtools/etdump/emitter.h b/devtools/etdump/emitter.h new file mode 100644 index 00000000000..09c1b56aa56 --- /dev/null +++ b/devtools/etdump/emitter.h @@ -0,0 +1,28 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include +#include + +#include + +typedef struct flatcc_builder flatcc_builder_t; + +namespace executorch { +namespace etdump { +namespace internal { + +int etdump_flatcc_custom_init( + flatcc_builder_t* builder, + internal::ETDumpStaticAllocator* alloc); + +} // namespace internal +} // namespace etdump +} // namespace executorch diff --git a/sdk/etdump/etdump_flatcc.cpp b/devtools/etdump/etdump_flatcc.cpp similarity index 53% rename from sdk/etdump/etdump_flatcc.cpp rename to devtools/etdump/etdump_flatcc.cpp index dab1443b55f..4c05bb5acee 100644 --- a/sdk/etdump/etdump_flatcc.cpp +++ b/devtools/etdump/etdump_flatcc.cpp @@ -6,19 +6,33 @@ * LICENSE file in the root directory of this source tree. 
*/ -#include "executorch/sdk/etdump/etdump_flatcc.h" -#include -#include +#include + +#include + +#include +#include +#include +#include +#include +#include + #include -#include -#include -#include "executorch/runtime/core/exec_aten/exec_aten.h" -#include "executorch/runtime/core/exec_aten/util/scalar_type_util.h" -#include "executorch/runtime/platform/assert.h" -#include "executorch/sdk/etdump/emitter.h" -namespace torch { -namespace executor { +using ::exec_aten::Tensor; +using ::executorch::runtime::AllocatorID; +using ::executorch::runtime::ArrayRef; +using ::executorch::runtime::ChainID; +using ::executorch::runtime::DebugHandle; +using ::executorch::runtime::DelegateDebugIdType; +using ::executorch::runtime::EValue; +using ::executorch::runtime::EventTracerEntry; +using ::executorch::runtime::LoggedEValueType; +using ::executorch::runtime::Span; +using ::executorch::runtime::Tag; + +namespace executorch { +namespace etdump { namespace { @@ -50,30 +64,30 @@ executorch_flatbuffer_ScalarType_enum_t get_flatbuffer_scalar_type( } etdump_Tensor_ref_t add_tensor_entry( - flatcc_builder_t* builder, + flatcc_builder_t* builder_, const exec_aten::Tensor& tensor, long offset) { - etdump_Tensor_start(builder); + etdump_Tensor_start(builder_); etdump_Tensor_scalar_type_add( - builder, get_flatbuffer_scalar_type(tensor.scalar_type())); - etdump_Tensor_sizes_start(builder); + builder_, get_flatbuffer_scalar_type(tensor.scalar_type())); + etdump_Tensor_sizes_start(builder_); for (auto dim : tensor.sizes()) { int64_t cast_dim = static_cast(dim); - etdump_Tensor_sizes_push(builder, &cast_dim); + etdump_Tensor_sizes_push(builder_, &cast_dim); } - etdump_Tensor_sizes_end(builder); + etdump_Tensor_sizes_end(builder_); - etdump_Tensor_strides_start(builder); + etdump_Tensor_strides_start(builder_); for (auto dim : tensor.strides()) { int64_t cast_dim = static_cast(dim); - etdump_Tensor_strides_push(builder, &cast_dim); + etdump_Tensor_strides_push(builder_, &cast_dim); } - etdump_Tensor_strides_end(builder); - etdump_Tensor_offset_add(builder, offset); + etdump_Tensor_strides_end(builder_); + etdump_Tensor_offset_add(builder_, offset); - return etdump_Tensor_end(builder); + return etdump_Tensor_end(builder_); } static uint8_t* alignPointer(void* ptr, size_t alignment) { @@ -88,71 +102,71 @@ static uint8_t* alignPointer(void* ptr, size_t alignment) { } // namespace -constexpr size_t max_alloc_buf_size = 128 * 1024; - // Constructor implementation ETDumpGen::ETDumpGen(Span buffer) { - // Initialize the flatcc builder using the buffer and buffer size. + constexpr size_t max_alloc_buf_size = 128 * 1024; + + // Initialize the flatcc builder_ using the buffer and buffer size. if (buffer.data() != nullptr) { - builder = (struct flatcc_builder*)alignPointer(buffer.data(), 64); + builder_ = (struct flatcc_builder*)alignPointer(buffer.data(), 64); uintptr_t buffer_with_builder = - (uintptr_t)alignPointer(builder + sizeof(struct flatcc_builder), 64); + (uintptr_t)alignPointer(builder_ + sizeof(struct flatcc_builder), 64); size_t buffer_size = buffer.size() - (size_t)(buffer_with_builder - (uintptr_t)buffer.data()); - alloc.set_buffer( + alloc_.set_buffer( (uint8_t*)buffer_with_builder, buffer_size, (size_t)((buffer_size / 4 > max_alloc_buf_size) ? 
max_alloc_buf_size : buffer_size / 4)); - et_flatcc_custom_init(builder, &alloc); + internal::etdump_flatcc_custom_init(builder_, &alloc_); } else { - builder = (struct flatcc_builder*)malloc(sizeof(struct flatcc_builder)); + builder_ = (struct flatcc_builder*)malloc(sizeof(struct flatcc_builder)); ET_CHECK_MSG( - builder != nullptr, "Failed to allocate memory for flatcc builder."); - flatcc_builder_init(builder); + builder_ != nullptr, "Failed to allocate memory for flatcc builder_."); + flatcc_builder_init(builder_); } reset(); } ETDumpGen::~ETDumpGen() { - flatcc_builder_clear(builder); + flatcc_builder_clear(builder_); if (!is_static_etdump()) { - free(builder); + free(builder_); } } void ETDumpGen::reset() { - etdump_gen_state = ETDumpGen_Init; - num_blocks = 0; - flatcc_builder_reset(builder); - flatbuffers_buffer_start(builder, etdump_ETDump_file_identifier); - etdump_ETDump_start_as_root_with_size(builder); - etdump_ETDump_version_add(builder, ETDUMP_VERSION); - etdump_ETDump_run_data_start(builder); - etdump_ETDump_run_data_push_start(builder); + state_ = State::Init; + num_blocks_ = 0; + flatcc_builder_reset(builder_); + flatbuffers_buffer_start(builder_, etdump_ETDump_file_identifier); + etdump_ETDump_start_as_root_with_size(builder_); + etdump_ETDump_version_add(builder_, ETDUMP_VERSION); + etdump_ETDump_run_data_start(builder_); + etdump_ETDump_run_data_push_start(builder_); } void ETDumpGen::create_event_block(const char* name) { - if (etdump_gen_state == ETDumpGen_Adding_Events) { - etdump_RunData_events_end(builder); - } else if (etdump_gen_state == ETDumpGen_Done) { + if (state_ == State::AddingEvents) { + etdump_RunData_events_end(builder_); + } else if (state_ == State::Done) { reset(); } - if (num_blocks > 0) { - etdump_ETDump_run_data_push_end(builder); - etdump_ETDump_run_data_push_start(builder); + if (num_blocks_ > 0) { + etdump_ETDump_run_data_push_end(builder_); + etdump_ETDump_run_data_push_start(builder_); } - ++num_blocks; - etdump_RunData_name_create_strn(builder, name, strlen(name)); - if (bundled_input_index != -1) { - etdump_RunData_bundled_input_index_add(builder, bundled_input_index); + ++num_blocks_; + etdump_RunData_name_create_strn(builder_, name, strlen(name)); + if (bundled_input_index_ != -1) { + etdump_RunData_bundled_input_index_add(builder_, bundled_input_index_); } - etdump_gen_state = ETDumpGen_Block_Created; + state_ = State::BlockCreated; } int64_t ETDumpGen::create_string_entry(const char* name) { - return flatbuffers_string_create_str(builder, name); + return flatbuffers_string_create_str(builder_, name); } // ETDumpGen has the following possible states, ETDumpGen_Init, @@ -169,16 +183,15 @@ int64_t ETDumpGen::create_string_entry(const char* name) { // type again. In this case once we close the allocators table and start pushing // to the events table we cannot push to the allocators table again. void ETDumpGen::check_ready_to_add_events() { - if (etdump_gen_state != ETDumpGen_Adding_Events) { + if (state_ != State::AddingEvents) { ET_CHECK_MSG( - (etdump_gen_state == ETDumpGen_Adding_Allocators || - etdump_gen_state == ETDumpGen_Block_Created), + (state_ == State::AddingAllocators || state_ == State::BlockCreated), "ETDumpGen in an invalid state. 
Cannot add new events now."); - if (etdump_gen_state == ETDumpGen_Adding_Allocators) { - etdump_RunData_allocators_end(builder); + if (state_ == State::AddingAllocators) { + etdump_RunData_allocators_end(builder_); } - etdump_RunData_events_start(builder); - etdump_gen_state = ETDumpGen_Adding_Events; + etdump_RunData_events_start(builder_); + state_ = State::AddingEvents; } } @@ -231,29 +244,29 @@ void ETDumpGen::end_profiling_delegate( check_ready_to_add_events(); // Start building the ProfileEvent entry. - etdump_ProfileEvent_start(builder); - etdump_ProfileEvent_start_time_add(builder, event_tracer_entry.start_time); - etdump_ProfileEvent_end_time_add(builder, end_time); - etdump_ProfileEvent_chain_index_add(builder, chain_id_); - etdump_ProfileEvent_instruction_id_add(builder, debug_handle_); + etdump_ProfileEvent_start(builder_); + etdump_ProfileEvent_start_time_add(builder_, event_tracer_entry.start_time); + etdump_ProfileEvent_end_time_add(builder_, end_time); + etdump_ProfileEvent_chain_index_add(builder_, chain_id_); + etdump_ProfileEvent_instruction_id_add(builder_, debug_handle_); // Delegate debug identifier can either be of a string type or an integer // type. If it's a string type then it's a value of type // flatbuffers_string_ref_t type, whereas if it's an integer type then we // write the integer value directly. if (event_tracer_entry.delegate_event_id_type == DelegateDebugIdType::kInt) { etdump_ProfileEvent_delegate_debug_id_int_add( - builder, event_tracer_entry.event_id); + builder_, event_tracer_entry.event_id); } else { etdump_ProfileEvent_delegate_debug_id_str_add( - builder, event_tracer_entry.event_id); + builder_, event_tracer_entry.event_id); } flatbuffers_uint8_vec_ref_t vec_ref = flatbuffers_uint8_vec_create_pe( - builder, (const uint8_t*)metadata, metadata_len); - etdump_ProfileEvent_delegate_debug_metadata_add(builder, vec_ref); - etdump_ProfileEvent_ref_t id = etdump_ProfileEvent_end(builder); - etdump_RunData_events_push_start(builder); - etdump_Event_profile_event_add(builder, id); - etdump_RunData_events_push_end(builder); + builder_, (const uint8_t*)metadata, metadata_len); + etdump_ProfileEvent_delegate_debug_metadata_add(builder_, vec_ref); + etdump_ProfileEvent_ref_t id = etdump_ProfileEvent_end(builder_); + etdump_RunData_events_push_start(builder_); + etdump_Event_profile_event_add(builder_, id); + etdump_RunData_events_push_end(builder_); } void ETDumpGen::log_profiling_delegate( @@ -268,24 +281,24 @@ void ETDumpGen::log_profiling_delegate( "Only name or delegate_debug_index can be valid. Check DelegateMappingBuilder documentation for more details."); check_ready_to_add_events(); int64_t string_id = name != nullptr ? 
create_string_entry(name) : -1; - etdump_ProfileEvent_start(builder); - etdump_ProfileEvent_start_time_add(builder, start_time); - etdump_ProfileEvent_end_time_add(builder, end_time); - etdump_ProfileEvent_chain_index_add(builder, chain_id_); - etdump_ProfileEvent_instruction_id_add(builder, debug_handle_); + etdump_ProfileEvent_start(builder_); + etdump_ProfileEvent_start_time_add(builder_, start_time); + etdump_ProfileEvent_end_time_add(builder_, end_time); + etdump_ProfileEvent_chain_index_add(builder_, chain_id_); + etdump_ProfileEvent_instruction_id_add(builder_, debug_handle_); if (string_id == -1) { etdump_ProfileEvent_delegate_debug_id_int_add( - builder, delegate_debug_index); + builder_, delegate_debug_index); } else { - etdump_ProfileEvent_delegate_debug_id_str_add(builder, string_id); + etdump_ProfileEvent_delegate_debug_id_str_add(builder_, string_id); } flatbuffers_uint8_vec_ref_t vec_ref = flatbuffers_uint8_vec_create_pe( - builder, (const uint8_t*)metadata, metadata_len); - etdump_ProfileEvent_delegate_debug_metadata_add(builder, vec_ref); - etdump_ProfileEvent_ref_t id = etdump_ProfileEvent_end(builder); - etdump_RunData_events_push_start(builder); - etdump_Event_profile_event_add(builder, id); - etdump_RunData_events_push_end(builder); + builder_, (const uint8_t*)metadata, metadata_len); + etdump_ProfileEvent_delegate_debug_metadata_add(builder_, vec_ref); + etdump_ProfileEvent_ref_t id = etdump_ProfileEvent_end(builder_); + etdump_RunData_events_push_start(builder_); + etdump_Event_profile_event_add(builder_, id); + etdump_RunData_events_push_end(builder_); } void ETDumpGen::log_intermediate_output_delegate( @@ -331,7 +344,7 @@ void ETDumpGen::log_intermediate_output_delegate_helper( ET_CHECK_MSG( (name == nullptr) ^ (delegate_debug_index == -1), "Only name or delegate_debug_index can be valid. Check DelegateMappingBuilder documentation for more details."); - if (debug_buffer.empty()) { + if (debug_buffer_.empty()) { ET_CHECK_MSG(0, "Must pre-set debug buffer with set_debug_buffer()\n"); return; } @@ -339,71 +352,71 @@ void ETDumpGen::log_intermediate_output_delegate_helper( check_ready_to_add_events(); int64_t string_id = name != nullptr ? 
create_string_entry(name) : -1; - etdump_DebugEvent_start(builder); + etdump_DebugEvent_start(builder_); - etdump_DebugEvent_chain_index_add(builder, chain_id_); - etdump_DebugEvent_instruction_id_add(builder, debug_handle_); + etdump_DebugEvent_chain_index_add(builder_, chain_id_); + etdump_DebugEvent_instruction_id_add(builder_, debug_handle_); if (string_id == -1) { - etdump_DebugEvent_delegate_debug_id_int_add(builder, delegate_debug_index); + etdump_DebugEvent_delegate_debug_id_int_add(builder_, delegate_debug_index); } else { - etdump_DebugEvent_delegate_debug_id_str_add(builder, string_id); + etdump_DebugEvent_delegate_debug_id_str_add(builder_, string_id); } // Check the type of `output` then call the corresponding logging functions if constexpr (std::is_same::value) { long offset = copy_tensor_to_debug_buffer(output); - etdump_Tensor_ref_t tensor_ref = add_tensor_entry(builder, output, offset); + etdump_Tensor_ref_t tensor_ref = add_tensor_entry(builder_, output, offset); - etdump_Value_start(builder); - etdump_Value_val_add(builder, etdump_ValueType_Tensor); - etdump_Value_tensor_add(builder, tensor_ref); + etdump_Value_start(builder_); + etdump_Value_val_add(builder_, etdump_ValueType_Tensor); + etdump_Value_tensor_add(builder_, tensor_ref); } else if constexpr (std::is_same>::value) { - etdump_Tensor_vec_start(builder); + etdump_Tensor_vec_start(builder_); for (size_t i = 0; i < output.size(); ++i) { long offset = copy_tensor_to_debug_buffer(output[i]); etdump_Tensor_vec_push( - builder, add_tensor_entry(builder, output[i], offset)); + builder_, add_tensor_entry(builder_, output[i], offset)); } - etdump_Tensor_vec_ref_t tensor_vec_ref = etdump_Tensor_vec_end(builder); + etdump_Tensor_vec_ref_t tensor_vec_ref = etdump_Tensor_vec_end(builder_); etdump_TensorList_ref_t tensor_list_ref = - etdump_TensorList_create(builder, tensor_vec_ref); + etdump_TensorList_create(builder_, tensor_vec_ref); - etdump_Value_start(builder); - etdump_Value_val_add(builder, etdump_ValueType_TensorList); - etdump_Value_tensor_list_add(builder, tensor_list_ref); + etdump_Value_start(builder_); + etdump_Value_val_add(builder_, etdump_ValueType_TensorList); + etdump_Value_tensor_list_add(builder_, tensor_list_ref); } else if constexpr (std::is_same::value) { - auto int_ref = etdump_Int_create(builder, output); + auto int_ref = etdump_Int_create(builder_, output); - etdump_Value_start(builder); - etdump_Value_val_add(builder, etdump_ValueType_Int); - etdump_Value_int_value_add(builder, int_ref); + etdump_Value_start(builder_); + etdump_Value_val_add(builder_, etdump_ValueType_Int); + etdump_Value_int_value_add(builder_, int_ref); } else if constexpr (std::is_same::value) { - auto double_ref = etdump_Double_create(builder, output); + auto double_ref = etdump_Double_create(builder_, output); - etdump_Value_start(builder); - etdump_Value_double_value_add(builder, double_ref); - etdump_Value_val_add(builder, etdump_ValueType_Double); + etdump_Value_start(builder_); + etdump_Value_double_value_add(builder_, double_ref); + etdump_Value_val_add(builder_, etdump_ValueType_Double); } else if constexpr (std::is_same::value) { flatbuffers_bool_t flatbuffer_bool_val = output ? 
FLATBUFFERS_TRUE : FLATBUFFERS_FALSE; - auto bool_ref = etdump_Bool_create(builder, flatbuffer_bool_val); + auto bool_ref = etdump_Bool_create(builder_, flatbuffer_bool_val); - etdump_Value_start(builder); - etdump_Value_bool_value_add(builder, bool_ref); - etdump_Value_val_add(builder, etdump_ValueType_Bool); + etdump_Value_start(builder_); + etdump_Value_bool_value_add(builder_, bool_ref); + etdump_Value_val_add(builder_, etdump_ValueType_Bool); } else { ET_CHECK_MSG(0, "Unsupported output type for intermediate logging\n"); } - auto value_ref = etdump_Value_end(builder); - etdump_DebugEvent_debug_entry_add(builder, value_ref); + auto value_ref = etdump_Value_end(builder_); + etdump_DebugEvent_debug_entry_add(builder_, value_ref); - etdump_DebugEvent_ref_t debug_event = etdump_DebugEvent_end(builder); + etdump_DebugEvent_ref_t debug_event = etdump_DebugEvent_end(builder_); - etdump_RunData_events_push_start(builder); - etdump_Event_debug_event_add(builder, debug_event); - etdump_RunData_events_push_end(builder); + etdump_RunData_events_push_start(builder_); + etdump_Event_debug_event_add(builder_, debug_event); + etdump_RunData_events_push_end(builder_); } void ETDumpGen::end_profiling(EventTracerEntry prof_entry) { @@ -413,32 +426,31 @@ void ETDumpGen::end_profiling(EventTracerEntry prof_entry) { "Delegate events must use end_profiling_delegate to mark the end of a delegate profiling event."); check_ready_to_add_events(); - etdump_ProfileEvent_start(builder); - etdump_ProfileEvent_start_time_add(builder, prof_entry.start_time); - etdump_ProfileEvent_end_time_add(builder, end_time); - etdump_ProfileEvent_chain_index_add(builder, prof_entry.chain_id); - etdump_ProfileEvent_instruction_id_add(builder, prof_entry.debug_handle); + etdump_ProfileEvent_start(builder_); + etdump_ProfileEvent_start_time_add(builder_, prof_entry.start_time); + etdump_ProfileEvent_end_time_add(builder_, end_time); + etdump_ProfileEvent_chain_index_add(builder_, prof_entry.chain_id); + etdump_ProfileEvent_instruction_id_add(builder_, prof_entry.debug_handle); if (prof_entry.event_id != -1) { - etdump_ProfileEvent_name_add(builder, prof_entry.event_id); + etdump_ProfileEvent_name_add(builder_, prof_entry.event_id); } - etdump_ProfileEvent_ref_t id = etdump_ProfileEvent_end(builder); - etdump_RunData_events_push_start(builder); - etdump_Event_profile_event_add(builder, id); - etdump_RunData_events_push_end(builder); + etdump_ProfileEvent_ref_t id = etdump_ProfileEvent_end(builder_); + etdump_RunData_events_push_start(builder_); + etdump_Event_profile_event_add(builder_, id); + etdump_RunData_events_push_end(builder_); } AllocatorID ETDumpGen::track_allocator(const char* name) { ET_CHECK_MSG( - (etdump_gen_state == ETDumpGen_Block_Created || - etdump_gen_state == ETDumpGen_Adding_Allocators), + (state_ == State::BlockCreated || state_ == State::AddingAllocators), "Allocators can only be added immediately after a new block is created and before any events are added."); - if (etdump_gen_state != ETDumpGen_Adding_Allocators) { - etdump_RunData_allocators_start(builder); - etdump_gen_state = ETDumpGen_Adding_Allocators; + if (state_ != State::AddingAllocators) { + etdump_RunData_allocators_start(builder_); + state_ = State::AddingAllocators; } flatbuffers_string_ref_t ref = create_string_entry(name); - etdump_RunData_allocators_push_create(builder, ref); - return etdump_RunData_allocators_reserved_len(builder); + etdump_RunData_allocators_push_create(builder_, ref); + return 
etdump_RunData_allocators_reserved_len(builder_); } void ETDumpGen::track_allocation( @@ -446,43 +458,43 @@ void ETDumpGen::track_allocation( size_t allocation_size) { check_ready_to_add_events(); - etdump_RunData_events_push_start(builder); - etdump_Event_allocation_event_create(builder, allocator_id, allocation_size); - etdump_RunData_events_push_end(builder); + etdump_RunData_events_push_start(builder_); + etdump_Event_allocation_event_create(builder_, allocator_id, allocation_size); + etdump_RunData_events_push_end(builder_); } -etdump_result ETDumpGen::get_etdump_data() { - etdump_result result; - if (etdump_gen_state == ETDumpGen_Adding_Events) { - etdump_RunData_events_end(builder); - } else if (etdump_gen_state == ETDumpGen_Adding_Allocators) { - etdump_RunData_allocators_end(builder); - } else if (etdump_gen_state == ETDumpGen_Init) { +ETDumpResult ETDumpGen::get_etdump_data() { + ETDumpResult result; + if (state_ == State::AddingEvents) { + etdump_RunData_events_end(builder_); + } else if (state_ == State::AddingAllocators) { + etdump_RunData_allocators_end(builder_); + } else if (state_ == State::Init) { result.buf = nullptr; result.size = 0; return result; } - etdump_ETDump_run_data_push_end(builder); - etdump_ETDump_run_data_end(builder); - etdump_ETDump_ref_t root = etdump_ETDump_end(builder); - flatbuffers_buffer_end(builder, root); - if (num_blocks == 0) { + etdump_ETDump_run_data_push_end(builder_); + etdump_ETDump_run_data_end(builder_); + etdump_ETDump_ref_t root = etdump_ETDump_end(builder_); + flatbuffers_buffer_end(builder_, root); + if (num_blocks_ == 0) { result = {nullptr, 0}; } else { - if (alloc.data) { - result.buf = alloc.front_cursor; - result.size = alloc.out_size - alloc.front_left; + if (alloc_.data) { + result.buf = alloc_.front_cursor; + result.size = alloc_.out_size - alloc_.front_left; } else { result.buf = - flatcc_builder_finalize_aligned_buffer(builder, &result.size); + flatcc_builder_finalize_aligned_buffer(builder_, &result.size); } } - etdump_gen_state = ETDumpGen_Done; + state_ = State::Done; return result; } void ETDumpGen::set_debug_buffer(Span buffer) { - debug_buffer = buffer; + debug_buffer_ = buffer; } size_t ETDumpGen::copy_tensor_to_debug_buffer(exec_aten::Tensor tensor) { @@ -490,94 +502,94 @@ size_t ETDumpGen::copy_tensor_to_debug_buffer(exec_aten::Tensor tensor) { return static_cast(-1); } uint8_t* offset_ptr = - alignPointer(debug_buffer.data() + debug_buffer_offset, 64); - debug_buffer_offset = (offset_ptr - debug_buffer.data()) + tensor.nbytes(); + alignPointer(debug_buffer_.data() + debug_buffer_offset_, 64); + debug_buffer_offset_ = (offset_ptr - debug_buffer_.data()) + tensor.nbytes(); ET_CHECK_MSG( - debug_buffer_offset <= debug_buffer.size(), + debug_buffer_offset_ <= debug_buffer_.size(), "Ran out of space to store intermediate outputs."); memcpy(offset_ptr, tensor.const_data_ptr(), tensor.nbytes()); - return (size_t)(offset_ptr - debug_buffer.data()); + return (size_t)(offset_ptr - debug_buffer_.data()); } void ETDumpGen::log_evalue(const EValue& evalue, LoggedEValueType evalue_type) { - if (debug_buffer.empty()) { + if (debug_buffer_.empty()) { return; } check_ready_to_add_events(); - etdump_DebugEvent_start(builder); + etdump_DebugEvent_start(builder_); - etdump_DebugEvent_chain_index_add(builder, chain_id_); - etdump_DebugEvent_instruction_id_add(builder, debug_handle_); + etdump_DebugEvent_chain_index_add(builder_, chain_id_); + etdump_DebugEvent_instruction_id_add(builder_, debug_handle_); switch (evalue.tag) { case 
Tag::Tensor: { exec_aten::Tensor tensor = evalue.toTensor(); long offset = copy_tensor_to_debug_buffer(tensor); etdump_Tensor_ref_t tensor_ref = - add_tensor_entry(builder, tensor, offset); + add_tensor_entry(builder_, tensor, offset); - etdump_Value_start(builder); - etdump_Value_val_add(builder, etdump_ValueType_Tensor); - etdump_Value_tensor_add(builder, tensor_ref); + etdump_Value_start(builder_); + etdump_Value_val_add(builder_, etdump_ValueType_Tensor); + etdump_Value_tensor_add(builder_, tensor_ref); if (evalue_type == LoggedEValueType::kProgramOutput) { - auto bool_ref = etdump_Bool_create(builder, FLATBUFFERS_TRUE); - etdump_Value_output_add(builder, bool_ref); + auto bool_ref = etdump_Bool_create(builder_, FLATBUFFERS_TRUE); + etdump_Value_output_add(builder_, bool_ref); } - auto value_ref = etdump_Value_end(builder); + auto value_ref = etdump_Value_end(builder_); - etdump_DebugEvent_debug_entry_add(builder, value_ref); + etdump_DebugEvent_debug_entry_add(builder_, value_ref); break; } case Tag::ListTensor: { exec_aten::ArrayRef tensors = evalue.toTensorList(); - etdump_Tensor_vec_start(builder); + etdump_Tensor_vec_start(builder_); for (size_t i = 0; i < tensors.size(); ++i) { long offset = copy_tensor_to_debug_buffer(tensors[i]); etdump_Tensor_vec_push( - builder, add_tensor_entry(builder, tensors[i], offset)); + builder_, add_tensor_entry(builder_, tensors[i], offset)); } - etdump_Tensor_vec_ref_t tensor_vec_ref = etdump_Tensor_vec_end(builder); + etdump_Tensor_vec_ref_t tensor_vec_ref = etdump_Tensor_vec_end(builder_); etdump_TensorList_ref_t tensor_list_ref = - etdump_TensorList_create(builder, tensor_vec_ref); + etdump_TensorList_create(builder_, tensor_vec_ref); - etdump_Value_start(builder); - etdump_Value_val_add(builder, etdump_ValueType_TensorList); - etdump_Value_tensor_list_add(builder, tensor_list_ref); + etdump_Value_start(builder_); + etdump_Value_val_add(builder_, etdump_ValueType_TensorList); + etdump_Value_tensor_list_add(builder_, tensor_list_ref); if (evalue_type == LoggedEValueType::kProgramOutput) { - auto bool_ref = etdump_Bool_create(builder, FLATBUFFERS_TRUE); - etdump_Value_output_add(builder, bool_ref); + auto bool_ref = etdump_Bool_create(builder_, FLATBUFFERS_TRUE); + etdump_Value_output_add(builder_, bool_ref); } - auto value_ref = etdump_Value_end(builder); + auto value_ref = etdump_Value_end(builder_); - etdump_DebugEvent_debug_entry_add(builder, value_ref); + etdump_DebugEvent_debug_entry_add(builder_, value_ref); break; } case Tag::Int: { int64_t val = evalue.toInt(); - auto int_ref = etdump_Int_create(builder, val); + auto int_ref = etdump_Int_create(builder_, val); - etdump_Value_start(builder); - etdump_Value_val_add(builder, etdump_ValueType_Int); - etdump_Value_int_value_add(builder, int_ref); - auto value_ref = etdump_Value_end(builder); - etdump_DebugEvent_debug_entry_add(builder, value_ref); + etdump_Value_start(builder_); + etdump_Value_val_add(builder_, etdump_ValueType_Int); + etdump_Value_int_value_add(builder_, int_ref); + auto value_ref = etdump_Value_end(builder_); + etdump_DebugEvent_debug_entry_add(builder_, value_ref); break; } case Tag::Double: { double val = evalue.toDouble(); - auto double_ref = etdump_Double_create(builder, val); + auto double_ref = etdump_Double_create(builder_, val); - etdump_Value_start(builder); - etdump_Value_double_value_add(builder, double_ref); - etdump_Value_val_add(builder, etdump_ValueType_Double); - auto value_ref = etdump_Value_end(builder); - etdump_DebugEvent_debug_entry_add(builder, 
value_ref); + etdump_Value_start(builder_); + etdump_Value_double_value_add(builder_, double_ref); + etdump_Value_val_add(builder_, etdump_ValueType_Double); + auto value_ref = etdump_Value_end(builder_); + etdump_DebugEvent_debug_entry_add(builder_, value_ref); break; } @@ -585,13 +597,13 @@ void ETDumpGen::log_evalue(const EValue& evalue, LoggedEValueType evalue_type) { case Tag::Bool: { flatbuffers_bool_t flatbuffer_bool_val = evalue.toBool() ? FLATBUFFERS_TRUE : FLATBUFFERS_FALSE; - auto bool_ref = etdump_Bool_create(builder, flatbuffer_bool_val); + auto bool_ref = etdump_Bool_create(builder_, flatbuffer_bool_val); - etdump_Value_start(builder); - etdump_Value_bool_value_add(builder, bool_ref); - etdump_Value_val_add(builder, etdump_ValueType_Bool); - auto value_ref = etdump_Value_end(builder); - etdump_DebugEvent_debug_entry_add(builder, value_ref); + etdump_Value_start(builder_); + etdump_Value_bool_value_add(builder_, bool_ref); + etdump_Value_val_add(builder_, etdump_ValueType_Bool); + auto value_ref = etdump_Value_end(builder_); + etdump_DebugEvent_debug_entry_add(builder_, value_ref); break; } @@ -604,20 +616,20 @@ void ETDumpGen::log_evalue(const EValue& evalue, LoggedEValueType evalue_type) { break; } - etdump_DebugEvent_ref_t debug_event = etdump_DebugEvent_end(builder); + etdump_DebugEvent_ref_t debug_event = etdump_DebugEvent_end(builder_); - etdump_RunData_events_push_start(builder); - etdump_Event_debug_event_add(builder, debug_event); - etdump_RunData_events_push_end(builder); + etdump_RunData_events_push_start(builder_); + etdump_Event_debug_event_add(builder_, debug_event); + etdump_RunData_events_push_end(builder_); } size_t ETDumpGen::get_num_blocks() { - return num_blocks; + return num_blocks_; } bool ETDumpGen::is_static_etdump() { - return alloc.data != nullptr; + return alloc_.data != nullptr; } -} // namespace executor -} // namespace torch +} // namespace etdump +} // namespace executorch diff --git a/sdk/etdump/etdump_flatcc.h b/devtools/etdump/etdump_flatcc.h similarity index 53% rename from sdk/etdump/etdump_flatcc.h rename to devtools/etdump/etdump_flatcc.h index e56d09f8107..0bd891a0970 100644 --- a/sdk/etdump/etdump_flatcc.h +++ b/devtools/etdump/etdump_flatcc.h @@ -8,33 +8,22 @@ #pragma once -#include #include -#include "executorch/runtime/core/event_tracer.h" -#include "executorch/runtime/platform/platform.h" + +#include +#include +#include #define ETDUMP_VERSION 0 struct flatcc_builder; -namespace torch { -namespace executor { - -enum ETDumpGen_State { - ETDumpGen_Init, - ETDumpGen_Block_Created, - ETDumpGen_Adding_Allocators, - ETDumpGen_Adding_Events, - ETDumpGen_Done, -}; +namespace executorch { +namespace etdump { -struct etdump_result { - void* buf; - size_t size; -}; - -struct etdump_static_allocator { - etdump_static_allocator() {} +namespace internal { +struct ETDumpStaticAllocator { + ETDumpStaticAllocator() = default; void set_buffer(uint8_t* buffer, size_t total_buf_size, size_t alloc_buf_size) { @@ -64,61 +53,72 @@ struct etdump_static_allocator { // Bytes left in front of front_cursor. 
size_t front_left{0}; }; +} // namespace internal + +struct ETDumpResult { + void* buf; + size_t size; +}; -class ETDumpGen : public EventTracer { +class ETDumpGen : public ::executorch::runtime::EventTracer { public: - ETDumpGen(Span buffer = {nullptr, (size_t)0}); + ETDumpGen(::executorch::runtime::Span buffer = {nullptr, (size_t)0}); ~ETDumpGen() override; void clear_builder(); void create_event_block(const char* name) override; - virtual EventTracerEntry start_profiling( + virtual ::executorch::runtime::EventTracerEntry start_profiling( const char* name, - ChainID chain_id = -1, - DebugHandle debug_handle = 0) override; - virtual void end_profiling(EventTracerEntry prof_entry) override; - virtual EventTracerEntry start_profiling_delegate( + ::executorch::runtime::ChainID chain_id = -1, + ::executorch::runtime::DebugHandle debug_handle = 0) override; + virtual void end_profiling( + ::executorch::runtime::EventTracerEntry prof_entry) override; + virtual ::executorch::runtime::EventTracerEntry start_profiling_delegate( const char* name, - DebugHandle delegate_debug_index) override; + ::executorch::runtime::DebugHandle delegate_debug_index) override; virtual void end_profiling_delegate( - EventTracerEntry prof_entry, + ::executorch::runtime::EventTracerEntry prof_entry, const void* metadata, size_t metadata_len) override; virtual void log_profiling_delegate( const char* name, - DebugHandle delegate_debug_index, + ::executorch::runtime::DebugHandle delegate_debug_index, et_timestamp_t start_time, et_timestamp_t end_time, const void* metadata, size_t metadata_len) override; - virtual void track_allocation(AllocatorID id, size_t size) override; - virtual AllocatorID track_allocator(const char* name) override; + virtual void track_allocation( + ::executorch::runtime::AllocatorID id, + size_t size) override; + virtual ::executorch::runtime::AllocatorID track_allocator( + const char* name) override; virtual void log_evalue( - const EValue& evalue, - LoggedEValueType evalue_type = - LoggedEValueType::kIntermediateOutput) override; + const ::executorch::runtime::EValue& evalue, + ::executorch::runtime::LoggedEValueType evalue_type = + ::executorch::runtime::LoggedEValueType::kIntermediateOutput) + override; /** * Log an intermediate tensor output from a delegate. */ virtual void log_intermediate_output_delegate( const char* name, - DebugHandle delegate_debug_index, - const Tensor& output) override; + ::executorch::runtime::DebugHandle delegate_debug_index, + const exec_aten::Tensor& output) override; /** * Log an intermediate tensor array output from a delegate. */ virtual void log_intermediate_output_delegate( const char* name, - DebugHandle delegate_debug_index, - const ArrayRef output) override; + ::executorch::runtime::DebugHandle delegate_debug_index, + const ::executorch::runtime::ArrayRef output) override; /** * Log an intermediate int output from a delegate. 
*/ virtual void log_intermediate_output_delegate( const char* name, - DebugHandle delegate_debug_index, + ::executorch::runtime::DebugHandle delegate_debug_index, const int& output) override; /** @@ -126,7 +126,7 @@ class ETDumpGen : public EventTracer { */ virtual void log_intermediate_output_delegate( const char* name, - DebugHandle delegate_debug_index, + ::executorch::runtime::DebugHandle delegate_debug_index, const bool& output) override; /** @@ -134,22 +134,22 @@ class ETDumpGen : public EventTracer { */ virtual void log_intermediate_output_delegate( const char* name, - DebugHandle delegate_debug_index, + ::executorch::runtime::DebugHandle delegate_debug_index, const double& output) override; - void set_debug_buffer(Span buffer); - etdump_result get_etdump_data(); + void set_debug_buffer(::executorch::runtime::Span buffer); + ETDumpResult get_etdump_data(); size_t get_num_blocks(); bool is_static_etdump(); void reset(); private: - struct flatcc_builder* builder; - size_t num_blocks = 0; - Span debug_buffer; - size_t debug_buffer_offset = 0; - int bundled_input_index = -1; - ETDumpGen_State etdump_gen_state = ETDumpGen_Init; - struct etdump_static_allocator alloc; + enum class State { + Init, + BlockCreated, + AddingAllocators, + AddingEvents, + Done, + }; void check_ready_to_add_events(); int64_t create_string_entry(const char* name); @@ -162,9 +162,26 @@ class ETDumpGen : public EventTracer { template void log_intermediate_output_delegate_helper( const char* name, - DebugHandle delegate_debug_index, + ::executorch::runtime::DebugHandle delegate_debug_index, const T& output); + + struct flatcc_builder* builder_; + size_t num_blocks_ = 0; + ::executorch::runtime::Span debug_buffer_; + size_t debug_buffer_offset_ = 0; + int bundled_input_index_ = -1; + State state_ = State::Init; + struct internal::ETDumpStaticAllocator alloc_; }; +} // namespace etdump +} // namespace executorch + +namespace torch { +namespace executor { +// TODO(T197294990): Remove these deprecated aliases once all users have moved +// to the new `::executorch` namespaces. +using etdump_result = ::executorch::etdump::ETDumpResult; +using ::executorch::etdump::ETDumpGen; } // namespace executor } // namespace torch diff --git a/sdk/etdump/etdump_schema_flatcc.fbs b/devtools/etdump/etdump_schema_flatcc.fbs similarity index 96% rename from sdk/etdump/etdump_schema_flatcc.fbs rename to devtools/etdump/etdump_schema_flatcc.fbs index d90d278f5fc..1244ebd4aeb 100644 --- a/sdk/etdump/etdump_schema_flatcc.fbs +++ b/devtools/etdump/etdump_schema_flatcc.fbs @@ -76,6 +76,10 @@ table DebugEvent { // String based delegate debug identifier. delegate_debug_id_str:string; + + // Name assigned to this debug event by the runtime. If it is an operator + // call this will just be the name of the operator that was executed. + name:string; } // All the details pertaining to an allocation done in the runtime. The main diff --git a/sdk/etdump/scalar_type.fbs b/devtools/etdump/scalar_type.fbs similarity index 97% rename from sdk/etdump/scalar_type.fbs rename to devtools/etdump/scalar_type.fbs index fdfe550e9e3..a8da080c679 100644 --- a/sdk/etdump/scalar_type.fbs +++ b/devtools/etdump/scalar_type.fbs @@ -14,6 +14,7 @@ enum ScalarType : byte { SHORT = 2, INT = 3, LONG = 4, + HALF = 5, FLOAT = 6, DOUBLE = 7, BOOL = 11, @@ -24,7 +25,6 @@ enum ScalarType : byte { QUINT4X2 = 16, QUINT2X4 = 17, // Types currently not implemented. 
- // Half = 5, // COMPLEXHALF = 8, // COMPLEXFLOAT = 9, // COMPLEXDOUBLE = 10, diff --git a/sdk/etdump/schema_flatcc.py b/devtools/etdump/schema_flatcc.py similarity index 96% rename from sdk/etdump/schema_flatcc.py rename to devtools/etdump/schema_flatcc.py index eaad876a536..404fa1c9758 100644 --- a/sdk/etdump/schema_flatcc.py +++ b/devtools/etdump/schema_flatcc.py @@ -7,7 +7,7 @@ # pyre-strict """ This file is the python representation of the schema contained in -executorch/sdk/etdump/etdump_schema.fbs. Any changes made to that +executorch/devtools/etdump/etdump_schema.fbs. Any changes made to that flatbuffer schema should accordingly be reflected here also. """ @@ -93,6 +93,7 @@ class Value: @dataclass class DebugEvent: + name: Optional[str] chain_index: int instruction_id: int delegate_debug_id_int: Optional[int] diff --git a/sdk/etdump/serialize.py b/devtools/etdump/serialize.py similarity index 98% rename from sdk/etdump/serialize.py rename to devtools/etdump/serialize.py index 0cc6682bfcb..4ed63bc385b 100644 --- a/sdk/etdump/serialize.py +++ b/devtools/etdump/serialize.py @@ -11,11 +11,11 @@ import tempfile import pkg_resources +from executorch.devtools.etdump.schema_flatcc import ETDumpFlatCC from executorch.exir._serialize._dataclass import _DataclassEncoder, _json_to_dataclass from executorch.exir._serialize._flatbuffer import _flatc_compile, _flatc_decompile -from executorch.sdk.etdump.schema_flatcc import ETDumpFlatCC # The prefix of schema files used for etdump ETDUMP_FLATCC_SCHEMA_NAME = "etdump_schema_flatcc" diff --git a/sdk/etdump/targets.bzl b/devtools/etdump/targets.bzl similarity index 99% rename from sdk/etdump/targets.bzl rename to devtools/etdump/targets.bzl index 6d548ce650f..ddbb35eab74 100644 --- a/sdk/etdump/targets.bzl +++ b/devtools/etdump/targets.bzl @@ -95,9 +95,11 @@ def define_common_targets(): "etdump_flatcc.cpp", "emitter.cpp", ], + headers = [ + "emitter.h", + ], exported_headers = [ "etdump_flatcc.h", - "emitter.h", ], deps = [ "//executorch/runtime/platform:platform", diff --git a/sdk/etdump/tests/CMakeLists.txt b/devtools/etdump/tests/CMakeLists.txt similarity index 100% rename from sdk/etdump/tests/CMakeLists.txt rename to devtools/etdump/tests/CMakeLists.txt diff --git a/sdk/etdump/tests/TARGETS b/devtools/etdump/tests/TARGETS similarity index 75% rename from sdk/etdump/tests/TARGETS rename to devtools/etdump/tests/TARGETS index ad48948c48a..51e807891df 100644 --- a/sdk/etdump/tests/TARGETS +++ b/devtools/etdump/tests/TARGETS @@ -11,8 +11,8 @@ python_unittest( "serialize_test.py", ], deps = [ + "//executorch/devtools/etdump:schema_flatcc", + "//executorch/devtools/etdump:serialize", "//executorch/exir/_serialize:lib", - "//executorch/sdk/etdump:schema_flatcc", - "//executorch/sdk/etdump:serialize", ], ) diff --git a/sdk/etdump/tests/etdump_test.cpp b/devtools/etdump/tests/etdump_test.cpp similarity index 94% rename from sdk/etdump/tests/etdump_test.cpp rename to devtools/etdump/tests/etdump_test.cpp index d30bd9a3037..b750e21eb07 100644 --- a/sdk/etdump/tests/etdump_test.cpp +++ b/devtools/etdump/tests/etdump_test.cpp @@ -9,19 +9,31 @@ #include #include +#include +#include +#include #include #include #include -#include -#include -#include #include #include #include #include -namespace torch { -namespace executor { +using ::exec_aten::ScalarType; +using ::exec_aten::Tensor; +using ::executorch::etdump::ETDumpGen; +using ::executorch::etdump::ETDumpResult; +using ::executorch::runtime::AllocatorID; +using ::executorch::runtime::ArrayRef; +using 
::executorch::runtime::BoxedEvalueList; +using ::executorch::runtime::DelegateDebugIdType; +using ::executorch::runtime::EValue; +using ::executorch::runtime::EventTracerEntry; +using ::executorch::runtime::LoggedEValueType; +using ::executorch::runtime::Span; +using ::executorch::runtime::Tag; +using ::executorch::runtime::testing::TensorFactory; class ProfilerETDumpTest : public ::testing::Test { protected: @@ -49,7 +61,7 @@ TEST_F(ProfilerETDumpTest, SingleProfileEvent) { EventTracerEntry entry = etdump_gen[i]->start_profiling("test_event", 0, 1); etdump_gen[i]->end_profiling(entry); - etdump_result result = etdump_gen[i]->get_etdump_data(); + ETDumpResult result = etdump_gen[i]->get_etdump_data(); ASSERT_TRUE(result.buf != nullptr); ASSERT_TRUE(result.size != 0); @@ -105,7 +117,7 @@ TEST_F(ProfilerETDumpTest, EmptyBlocks) { etdump_gen[i]->start_profiling("test_event_1", 0, 1); etdump_gen[i]->end_profiling(entry); - etdump_result result = etdump_gen[i]->get_etdump_data(); + ETDumpResult result = etdump_gen[i]->get_etdump_data(); ASSERT_TRUE(result.buf != nullptr); ASSERT_TRUE(result.size != 0); @@ -160,7 +172,7 @@ TEST_F(ProfilerETDumpTest, AllocationEvents) { TEST_F(ProfilerETDumpTest, DebugEvent) { for (size_t i = 0; i < 2; i++) { - testing::TensorFactory tf; + TensorFactory tf; EValue evalue(tf.ones({3, 2})); etdump_gen[i]->create_event_block("test_block"); @@ -189,7 +201,7 @@ TEST_F(ProfilerETDumpTest, DebugEvent) { TEST_F(ProfilerETDumpTest, DebugEventTensorList) { for (size_t i = 0; i < 2; i++) { - testing::TensorFactory tf; + TensorFactory tf; exec_aten::Tensor storage[2] = {tf.ones({3, 2}), tf.ones({3, 2})}; EValue evalue_1(storage[0]); EValue evalue_2(storage[1]); @@ -212,7 +224,7 @@ TEST_F(ProfilerETDumpTest, DebugEventTensorList) { } TEST_F(ProfilerETDumpTest, VerifyLogging) { - testing::TensorFactory tf; + TensorFactory tf; EValue evalue(tf.ones({3, 2})); for (size_t i = 0; i < 2; i++) { @@ -225,7 +237,7 @@ TEST_F(ProfilerETDumpTest, VerifyLogging) { etdump_gen[i]->log_evalue(evalue); etdump_gen[i]->log_evalue(evalue, LoggedEValueType::kProgramOutput); - etdump_result result = etdump_gen[i]->get_etdump_data(); + ETDumpResult result = etdump_gen[i]->get_etdump_data(); ASSERT_TRUE(result.buf != nullptr); ASSERT_TRUE(result.size != 0); @@ -297,7 +309,7 @@ TEST_F(ProfilerETDumpTest, MultipleBlocksWithEvents) { entry = etdump_gen[i]->start_profiling("test_event", 0, 1); etdump_gen[i]->end_profiling(entry); - etdump_result result = etdump_gen[i]->get_etdump_data(); + ETDumpResult result = etdump_gen[i]->get_etdump_data(); ASSERT_TRUE(result.buf != nullptr); ASSERT_TRUE(result.size != 0); @@ -363,7 +375,7 @@ TEST_F(ProfilerETDumpTest, VerifyData) { entry = etdump_gen[i]->start_profiling("test_event2", 0, 1); etdump_gen[i]->end_profiling(entry); - etdump_result result = etdump_gen[i]->get_etdump_data(); + ETDumpResult result = etdump_gen[i]->get_etdump_data(); ASSERT_TRUE(result.buf != nullptr); ASSERT_TRUE(result.size != 0); @@ -421,7 +433,7 @@ TEST_F(ProfilerETDumpTest, LogDelegateIntermediateOutput) { Span buffer((uint8_t*)ptr, 2048); etdump_gen[i]->create_event_block("test_block"); - testing::TensorFactory tf; + TensorFactory tf; ET_EXPECT_DEATH( etdump_gen[i]->log_intermediate_output_delegate( @@ -462,7 +474,7 @@ TEST_F(ProfilerETDumpTest, LogDelegateIntermediateOutput) { static_cast(-1), true); - etdump_result result = etdump_gen[i]->get_etdump_data(); + ETDumpResult result = etdump_gen[i]->get_etdump_data(); ASSERT_TRUE(result.buf != nullptr); ASSERT_TRUE(result.size != 0); 
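As a rough, hypothetical sketch of the flow these tests exercise — constructing the renamed `::executorch::etdump::ETDumpGen`, profiling an event, and writing out the resulting `ETDumpResult` — something like the following could be used. The include path, output file name, and the `free()` of the returned buffer are assumptions based on the sdk/ → devtools/ move in this diff, not part of the change itself.

```cpp
// Hypothetical usage sketch of the renamed ETDumpGen API shown in this diff.
// Include path assumed from the sdk/ -> devtools/ move; file name and free()
// of the returned buffer are illustrative assumptions.
#include <cstdio>
#include <cstdlib>

#include <executorch/devtools/etdump/etdump_flatcc.h>

using ::executorch::etdump::ETDumpGen;
using ::executorch::etdump::ETDumpResult;
using ::executorch::runtime::EventTracerEntry;

int main() {
  ETDumpGen etdump_gen;                      // default ctor: dynamically allocated builder
  etdump_gen.create_event_block("forward");  // events must be recorded inside a block

  EventTracerEntry entry =
      etdump_gen.start_profiling("my_event", /*chain_id=*/0, /*debug_handle=*/1);
  // ... code being profiled would run here ...
  etdump_gen.end_profiling(entry);

  ETDumpResult result = etdump_gen.get_etdump_data();  // {buf, size}
  if (result.buf != nullptr && result.size > 0) {
    FILE* f = std::fopen("etdump.etdp", "wb");  // placeholder file name
    std::fwrite(result.buf, 1, result.size, f);
    std::fclose(f);
    std::free(result.buf);  // assumption: heap-allocated when no Span buffer was supplied
  }
  return 0;
}
```

The same calls appear throughout `devtools/etdump/tests/etdump_test.cpp` above, just with the generator instances held in the test fixture.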
@@ -474,7 +486,7 @@ TEST_F(ProfilerETDumpTest, LogDelegateIntermediateOutput) { } TEST_F(ProfilerETDumpTest, VerifyDelegateIntermediateLogging) { - testing::TensorFactory tf; + TensorFactory tf; EValue evalue(tf.ones({3, 2})); for (size_t i = 0; i < 2; i++) { @@ -492,7 +504,7 @@ TEST_F(ProfilerETDumpTest, VerifyDelegateIntermediateLogging) { etdump_gen[i]->log_intermediate_output_delegate( nullptr, 258, tf.ones({5, 6})); - etdump_result result = etdump_gen[i]->get_etdump_data(); + ETDumpResult result = etdump_gen[i]->get_etdump_data(); ASSERT_TRUE(result.buf != nullptr); ASSERT_TRUE(result.size != 0); @@ -603,7 +615,7 @@ TEST_F(ProfilerETDumpTest, LogDelegateEvents) { etdump_gen[i]->end_profiling(entry), "Delegate events must use end_profiling_delegate to mark the end of a delegate profiling event."); - etdump_result result = etdump_gen[i]->get_etdump_data(); + ETDumpResult result = etdump_gen[i]->get_etdump_data(); ASSERT_TRUE(result.buf != nullptr); ASSERT_TRUE(result.size != 0); @@ -681,7 +693,7 @@ TEST_F(ProfilerETDumpTest, WriteAfterGetETDumpData) { etdump_gen[i]->start_profiling("test_event", 0, 1); etdump_gen[i]->end_profiling(entry); - etdump_result result = etdump_gen[i]->get_etdump_data(); + ETDumpResult result = etdump_gen[i]->get_etdump_data(); ASSERT_TRUE(result.buf != nullptr); ASSERT_TRUE(result.size != 0); @@ -712,6 +724,3 @@ TEST_F(ProfilerETDumpTest, WriteAfterGetETDumpData) { } } } - -} // namespace executor -} // namespace torch diff --git a/sdk/etdump/tests/serialize_test.py b/devtools/etdump/tests/serialize_test.py similarity index 96% rename from sdk/etdump/tests/serialize_test.py rename to devtools/etdump/tests/serialize_test.py index 2b1497f5974..5cab3e5b2ba 100644 --- a/sdk/etdump/tests/serialize_test.py +++ b/devtools/etdump/tests/serialize_test.py @@ -12,13 +12,13 @@ from pprint import pformat from typing import List -import executorch.sdk.etdump.schema_flatcc as flatcc -from executorch.exir._serialize._dataclass import _DataclassEncoder +import executorch.devtools.etdump.schema_flatcc as flatcc -from executorch.sdk.etdump.serialize import ( +from executorch.devtools.etdump.serialize import ( deserialize_from_etdump_flatcc, serialize_to_etdump_flatcc, ) +from executorch.exir._serialize._dataclass import _DataclassEncoder def diff_jsons(a: str, b: str) -> List[str]: @@ -83,6 +83,7 @@ def get_sample_etdump_flatcc() -> flatcc.ETDumpFlatCC: profile_event=None, allocation_event=None, debug_event=flatcc.DebugEvent( + name="test_debug_event", chain_index=1, instruction_id=0, delegate_debug_id_str="56", diff --git a/sdk/etdump/tests/targets.bzl b/devtools/etdump/tests/targets.bzl similarity index 82% rename from sdk/etdump/tests/targets.bzl rename to devtools/etdump/tests/targets.bzl index 41b19ca65ef..5299b7c1cb7 100644 --- a/sdk/etdump/tests/targets.bzl +++ b/devtools/etdump/tests/targets.bzl @@ -13,8 +13,8 @@ def define_common_targets(): "etdump_test.cpp", ], deps = [ - "//executorch/sdk/etdump:etdump_flatcc", - "//executorch/sdk/etdump:etdump_schema_flatcc", + "//executorch/devtools/etdump:etdump_flatcc", + "//executorch/devtools/etdump:etdump_schema_flatcc", "//executorch/runtime/platform:platform", "//executorch/runtime/core/exec_aten/testing_util:tensor_util", ], diff --git a/sdk/etrecord/TARGETS b/devtools/etrecord/TARGETS similarity index 71% rename from sdk/etrecord/TARGETS rename to devtools/etrecord/TARGETS index c7de63a81f4..09fc3212bf8 100644 --- a/sdk/etrecord/TARGETS +++ b/devtools/etrecord/TARGETS @@ -9,10 +9,10 @@ python_library( "_etrecord.py", ], 
deps = [ + "//executorch/devtools/bundled_program:core", + "//executorch/devtools/bundled_program/schema:bundled_program_schema_py", "//executorch/exir:lib", "//executorch/exir/emit:emit", "//executorch/exir/serde:serialize", - "//executorch/sdk/bundled_program:core", - "//executorch/sdk/bundled_program/schema:bundled_program_schema_py", ], ) diff --git a/sdk/etrecord/__init__.py b/devtools/etrecord/__init__.py similarity index 86% rename from sdk/etrecord/__init__.py rename to devtools/etrecord/__init__.py index 29c29462a7e..59ff4e44c2f 100644 --- a/sdk/etrecord/__init__.py +++ b/devtools/etrecord/__init__.py @@ -4,7 +4,7 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -from executorch.sdk.etrecord._etrecord import ( +from executorch.devtools.etrecord._etrecord import ( ETRecord, generate_etrecord, parse_etrecord, diff --git a/sdk/etrecord/_etrecord.py b/devtools/etrecord/_etrecord.py similarity index 96% rename from sdk/etrecord/_etrecord.py rename to devtools/etrecord/_etrecord.py index 55e231f2166..de7cf93990a 100644 --- a/sdk/etrecord/_etrecord.py +++ b/devtools/etrecord/_etrecord.py @@ -4,6 +4,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +# pyre-unsafe + import json import os import pickle @@ -12,6 +14,9 @@ from zipfile import BadZipFile, ZipFile from executorch import exir +from executorch.devtools.bundled_program.core import BundledProgram + +from executorch.devtools.bundled_program.schema.bundled_program_schema import Value from executorch.exir import ( EdgeProgramManager, ExecutorchProgram, @@ -23,9 +28,6 @@ from executorch.exir.serde.export_serialize import SerializedArtifact from executorch.exir.serde.serialize import deserialize, serialize -from executorch.sdk.bundled_program.core import BundledProgram - -from executorch.sdk.bundled_program.schema.bundled_program_schema import Value ProgramOutput = List[Value] @@ -182,13 +184,13 @@ def generate_etrecord( is the closest graph module representation of what is eventually run on the device. In addition to all the graph modules, we also serialize the program buffer, which the users can provide to the ExecuTorch runtime to run the model, and the debug handle map - for SDK tooling usage. + for Developer Tools usage. Args: - etrecord_path: Path to where the `ETRecord` file will be saved to. + et_record: Path to where the `ETRecord` file will be saved to. edge_dialect_program: `EdgeProgramManager` for this model returned by the call to to_edge() executorch_program: The ExecuTorch program for this model returned by the call to `to_executorch()` or the `BundledProgram` of this model - export_modules[Optional]: **Should be ignored by OSS users**. A dictionary of graph modules with the key being the user provided name and the + export_modules [Optional]: **Should be ignored by OSS users**. A dictionary of graph modules with the key being the user provided name and the value being the corresponding exported module. The exported graph modules can be either the output of `torch.export()` or `exir.to_edge()`. @@ -201,7 +203,7 @@ def generate_etrecord( etrecord_zip = ZipFile(et_record, "w") # Write the magic file identifier that will be used to verify that this file - # is an etrecord when it's used later in the SDK tooling. + # is an etrecord when it's used later in the Developer Tools. 
etrecord_zip.writestr(ETRecordReservedFileNames.ETRECORD_IDENTIFIER, "") if export_modules is not None: diff --git a/sdk/etrecord/tests/TARGETS b/devtools/etrecord/tests/TARGETS similarity index 64% rename from sdk/etrecord/tests/TARGETS rename to devtools/etrecord/tests/TARGETS index 0984c755a4e..fffa7f18341 100644 --- a/sdk/etrecord/tests/TARGETS +++ b/devtools/etrecord/tests/TARGETS @@ -8,11 +8,11 @@ python_unittest( srcs = ["etrecord_test.py"], deps = [ "//caffe2:torch", + "//executorch/devtools/bundled_program:config", + "//executorch/devtools/bundled_program:core", + "//executorch/devtools/etrecord:etrecord", "//executorch/exir:lib", "//executorch/exir/tests:models", - "//executorch/sdk/bundled_program:config", - "//executorch/sdk/bundled_program:core", - "//executorch/sdk/etrecord:etrecord", ], ) @@ -21,10 +21,10 @@ python_library( srcs = ["etrecord_test.py"], deps = [ "//caffe2:torch", + "//executorch/devtools/bundled_program:config", + "//executorch/devtools/bundled_program:core", + "//executorch/devtools/etrecord:etrecord", "//executorch/exir:lib", "//executorch/exir/tests:models", - "//executorch/sdk/bundled_program:config", - "//executorch/sdk/bundled_program:core", - "//executorch/sdk/etrecord:etrecord", ], ) diff --git a/sdk/etrecord/tests/etrecord_test.py b/devtools/etrecord/tests/etrecord_test.py similarity index 95% rename from sdk/etrecord/tests/etrecord_test.py rename to devtools/etrecord/tests/etrecord_test.py index bc534fd4871..daef7c3e1e2 100644 --- a/sdk/etrecord/tests/etrecord_test.py +++ b/devtools/etrecord/tests/etrecord_test.py @@ -4,6 +4,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +# pyre-unsafe + import copy import json import tempfile @@ -12,14 +14,14 @@ import executorch.exir.tests.models as models import torch from executorch import exir -from executorch.exir import EdgeCompileConfig, EdgeProgramManager, to_edge -from executorch.sdk.bundled_program.config import MethodTestCase, MethodTestSuite -from executorch.sdk.bundled_program.core import BundledProgram -from executorch.sdk.etrecord import generate_etrecord, parse_etrecord -from executorch.sdk.etrecord._etrecord import ( +from executorch.devtools.bundled_program.config import MethodTestCase, MethodTestSuite +from executorch.devtools.bundled_program.core import BundledProgram +from executorch.devtools.etrecord import generate_etrecord, parse_etrecord +from executorch.devtools.etrecord._etrecord import ( _get_reference_outputs, ETRecordReservedFileNames, ) +from executorch.exir import EdgeCompileConfig, EdgeProgramManager, to_edge from torch.export import export @@ -75,7 +77,7 @@ def get_test_model_with_manager(self): return (aten_dialect, edge_program_copy, edge_program.to_executorch()) # Serialized and deserialized graph modules are not completely the same, so we check - # that they are close enough and match especially on the parameters we care about in the SDK. + # that they are close enough and match especially on the parameters we care about in the Developer Tools. 
def check_graph_closeness(self, graph_a, graph_b): self.assertEqual(len(graph_a.graph.nodes), len(graph_b.graph.nodes)) for node_a, node_b in zip(graph_a.graph.nodes, graph_b.graph.nodes): diff --git a/sdk/inspector/TARGETS b/devtools/inspector/TARGETS similarity index 67% rename from sdk/inspector/TARGETS rename to devtools/inspector/TARGETS index bc53c90c115..bba5f7f8951 100644 --- a/sdk/inspector/TARGETS +++ b/devtools/inspector/TARGETS @@ -14,10 +14,10 @@ python_library( "fbsource//third-party/pypi/pandas:pandas", "fbsource//third-party/pypi/tabulate:tabulate", ":inspector_utils", + "//executorch/devtools/debug_format:et_schema", + "//executorch/devtools/etdump:schema_flatcc", + "//executorch/devtools/etrecord:etrecord", "//executorch/exir:lib", - "//executorch/sdk/debug_format:et_schema", - "//executorch/sdk/etdump:schema_flatcc", - "//executorch/sdk/etrecord:etrecord", ], ) @@ -26,8 +26,8 @@ python_binary( main_function = ".inspector_cli.main", main_src = "inspector_cli.py", deps = [ - ":inspector_utils", - "//executorch/sdk:lib", + "//executorch/devtools:lib", + "//executorch/devtools/inspector:lib", ], ) @@ -40,11 +40,11 @@ python_library( "fbsource//third-party/pypi/matplotlib:matplotlib", "fbsource//third-party/pypi/numpy:numpy", "//caffe2:torch", - "//executorch/sdk/debug_format:base_schema", - "//executorch/sdk/debug_format:et_schema", - "//executorch/sdk/etdump:schema_flatcc", - "//executorch/sdk/etdump:serialize", - "//executorch/sdk/etrecord:etrecord", + "//executorch/devtools/debug_format:base_schema", + "//executorch/devtools/debug_format:et_schema", + "//executorch/devtools/etdump:schema_flatcc", + "//executorch/devtools/etdump:serialize", + "//executorch/devtools/etrecord:etrecord", ], ) diff --git a/devtools/inspector/__init__.py b/devtools/inspector/__init__.py new file mode 100644 index 00000000000..375123a0a5b --- /dev/null +++ b/devtools/inspector/__init__.py @@ -0,0 +1,24 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# pyre-unsafe + +from executorch.devtools.inspector._inspector import ( + Event, + EventBlock, + Inspector, + PerfData, +) +from executorch.devtools.inspector._inspector_utils import compare_results, TimeScale + +__all__ = [ + "Event", + "EventBlock", + "Inspector", + "PerfData", + "compare_results", + "TimeScale", +] diff --git a/sdk/inspector/_inspector.py b/devtools/inspector/_inspector.py similarity index 94% rename from sdk/inspector/_inspector.py rename to devtools/inspector/_inspector.py index 5f9bfafee70..0539d4f5e4b 100644 --- a/sdk/inspector/_inspector.py +++ b/devtools/inspector/_inspector.py @@ -4,6 +4,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
+# pyre-unsafe + import dataclasses import logging import sys @@ -26,16 +28,20 @@ Union, ) -import executorch.sdk.etdump.schema_flatcc as flatcc +import executorch.devtools.etdump.schema_flatcc as flatcc import numpy as np import pandas as pd -from executorch.exir import ExportedProgram -from executorch.sdk.debug_format.et_schema import OperatorGraph, OperatorNode -from executorch.sdk.etdump.schema_flatcc import DebugEvent, ETDumpFlatCC, ProfileEvent -from executorch.sdk.etrecord import ETRecord, parse_etrecord -from executorch.sdk.inspector._inspector_utils import ( +from executorch.devtools.debug_format.et_schema import OperatorGraph, OperatorNode +from executorch.devtools.etdump.schema_flatcc import ( + DebugEvent, + ETDumpFlatCC, + ProfileEvent, +) +from executorch.devtools.etrecord import ETRecord, parse_etrecord +from executorch.devtools.inspector._inspector_utils import ( + calculate_time_scale_factor, create_debug_handle_to_op_node_mapping, EDGE_DIALECT_GRAPH_KEY, EXCLUDED_COLUMNS_WHEN_PRINTING, @@ -49,10 +55,10 @@ is_inference_output_equal, ProgramOutput, RESERVED_FRAMEWORK_EVENT_NAMES, - TIME_SCALE_DICT, TimeScale, verify_debug_data_equivalence, ) +from executorch.exir import ExportedProgram from tabulate import tabulate @@ -146,6 +152,7 @@ def _gen_from_event(event: ProfileEvent) -> "ProfileEventSignature": # Signature of a DebugEvent @dataclass(frozen=True, order=True) class DebugEventSignature: + name: str = "" instruction_id: Optional[int] = -1 delegate_id: Optional[int] = None delegate_id_str: Optional[str] = None @@ -159,6 +166,7 @@ def _gen_from_event(event: DebugEvent) -> "DebugEventSignature": The Signature will convert these back to the intended None value """ return DebugEventSignature( + event.name or "", event.instruction_id if event.instruction_id != -1 else None, event.delegate_debug_id_int if event.delegate_debug_id_int != -1 else None, event.delegate_debug_id_str if event.delegate_debug_id_str != "" else None, @@ -464,46 +472,63 @@ def _calculate_elapsed_time(start_time, end_time): return elapsed_time @staticmethod - def _populate_profiling_related_fields( + def _populate_event_signature_fields( ret_event: "Event", - profile_event_signature: Optional[ProfileEventSignature], - events: List[InstructionEvent], - scale_factor: float, + event_signature: Optional[Union[ProfileEventSignature, DebugEventSignature]], ) -> None: """ Given a partially constructed Event, populate the fields related to - the profile events + the profile event signature or debug event signature Fields Updated: name delegate_debug_identifier is_delegated_op - perf_data - delegate_debug_metadatas """ - - # Fill out fields from profile event signature - if profile_event_signature is not None: - if profile_event_signature.delegate_id is not None: # 0 is a valid value - delegate_debug_identifier = profile_event_signature.delegate_id + # TODO: T201347372 Push the None check to earlier in the stack.
+ if event_signature is not None: + if event_signature.delegate_id is not None: # 0 is a valid value + delegate_debug_identifier = event_signature.delegate_id else: - delegate_debug_identifier = ( - profile_event_signature.delegate_id_str or None - ) + delegate_debug_identifier = event_signature.delegate_id_str or None # Use the delegate identifier as the event name if delegated is_delegated_op = delegate_debug_identifier is not None name = ( - profile_event_signature.name + event_signature.name if not is_delegated_op else str(delegate_debug_identifier) ) # Update fields - ret_event.name = name + # This is for older version of etdump that doesn't have the name field for debug events, we don't update the name field + if name: + ret_event.name = name ret_event.delegate_debug_identifier = delegate_debug_identifier ret_event.is_delegated_op = is_delegated_op + @staticmethod + def _populate_profiling_related_fields( + ret_event: "Event", + profile_event_signature: Optional[ProfileEventSignature], + events: List[InstructionEvent], + scale_factor: float, + ) -> None: + """ + Given a partially constructed Event, populate the fields related to + the profile events + + Fields Updated: + name + delegate_debug_identifier + is_delegated_op + perf_data + delegate_debug_metadatas + """ + + # Fill out fields from profile event signature + Event._populate_event_signature_fields(ret_event, profile_event_signature) + # Fill out fields from profile event data = [] delegate_debug_metadatas = [] @@ -571,9 +596,15 @@ def _populate_debugging_related_fields( the debug events Fields Updated: + name + delegate_debug_identifier + is_delegated_op debug_data """ + # Fill out fields from debug event signature + Event._populate_event_signature_fields(ret_event, debug_event_signature) + debug_data: List[flatcc.Value] = [] for event in events: if (debug_events := event.debug_events) is None: @@ -795,9 +826,7 @@ class GroupedRunInstances: # Construct the EventBlocks event_blocks = [] - scale_factor = ( - TIME_SCALE_DICT[source_time_scale] / TIME_SCALE_DICT[target_time_scale] - ) + scale_factor = calculate_time_scale_factor(source_time_scale, target_time_scale) for run_signature, grouped_run_instance in run_groups.items(): run_group: OrderedDict[EventSignature, List[InstructionEvent]] = ( grouped_run_instance.events @@ -962,6 +991,9 @@ def __init__( debug_buffer_path: Debug buffer file path that contains the debug data referenced by ETDump for intermediate and program outputs. delegate_metadata_parser: Optional function to parse delegate metadata from an Profiling Event. Expected signature of the function is: (delegate_metadata_list: List[bytes]) -> Union[List[str], Dict[str, Any]] + delegate_time_scale_converter: Optional function to convert the time scale of delegate profiling data. If not given, use the conversion ratio of + target_time_scale/source_time_scale. + enable_module_hierarchy: Enable submodules in the operator graph. Defaults to False. 
Returns: None @@ -976,6 +1008,14 @@ def __init__( self._source_time_scale = source_time_scale self._target_time_scale = target_time_scale + if delegate_time_scale_converter is None: + scale_factor = calculate_time_scale_factor( + source_time_scale, target_time_scale + ) + delegate_time_scale_converter = ( + lambda event_name, input_time: input_time / scale_factor + ) + if etrecord is None: self._etrecord = None elif isinstance(etrecord, ETRecord): @@ -998,10 +1038,10 @@ def __init__( ) self.event_blocks = EventBlock._gen_from_etdump( - etdump, - self._source_time_scale, - self._target_time_scale, - output_buffer, + etdump=etdump, + source_time_scale=self._source_time_scale, + target_time_scale=self._target_time_scale, + output_buffer=output_buffer, delegate_metadata_parser=delegate_metadata_parser, delegate_time_scale_converter=delegate_time_scale_converter, ) diff --git a/sdk/inspector/_inspector_utils.py b/devtools/inspector/_inspector_utils.py similarity index 95% rename from sdk/inspector/_inspector_utils.py rename to devtools/inspector/_inspector_utils.py index 6879e855057..5f04e2d0413 100644 --- a/sdk/inspector/_inspector_utils.py +++ b/devtools/inspector/_inspector_utils.py @@ -4,18 +4,20 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +# pyre-unsafe + import math from enum import Enum from typing import Dict, List, Mapping, Optional, Tuple, TypeAlias, Union -import executorch.sdk.etdump.schema_flatcc as flatcc +import executorch.devtools.etdump.schema_flatcc as flatcc import torch -from executorch.sdk.debug_format.base_schema import OperatorNode +from executorch.devtools.debug_format.base_schema import OperatorNode -from executorch.sdk.debug_format.et_schema import FXOperatorGraph, OperatorGraph -from executorch.sdk.etdump.schema_flatcc import ( +from executorch.devtools.debug_format.et_schema import FXOperatorGraph, OperatorGraph +from executorch.devtools.etdump.schema_flatcc import ( DebugEvent, ETDumpFlatCC, ProfileEvent, @@ -25,8 +27,8 @@ ValueType, ) -from executorch.sdk.etdump.serialize import deserialize_from_etdump_flatcc -from executorch.sdk.etrecord import ETRecord +from executorch.devtools.etdump.serialize import deserialize_from_etdump_flatcc +from executorch.devtools.etrecord import ETRecord FORWARD = "forward" EDGE_DIALECT_GRAPH_KEY = "edge_dialect_graph_module" @@ -63,6 +65,15 @@ class TimeScale(Enum): } +def calculate_time_scale_factor( + source_time_scale: TimeScale, target_time_scale: TimeScale +) -> float: + """ + Calculate the factor (source divided by target) between two time scales + """ + return TIME_SCALE_DICT[source_time_scale] / TIME_SCALE_DICT[target_time_scale] + + # Model Debug Output InferenceOutput: TypeAlias = Union[ torch.Tensor, List[torch.Tensor], int, float, str, bool, None diff --git a/sdk/inspector/inspector_cli.py b/devtools/inspector/inspector_cli.py similarity index 93% rename from sdk/inspector/inspector_cli.py rename to devtools/inspector/inspector_cli.py index d6c8d5442f3..db3536a84bf 100644 --- a/sdk/inspector/inspector_cli.py +++ b/devtools/inspector/inspector_cli.py @@ -4,10 +4,12 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
+# pyre-unsafe + import argparse -from executorch.sdk import Inspector -from executorch.sdk.inspector._inspector_utils import compare_results, TimeScale +from executorch.devtools import Inspector +from executorch.devtools.inspector import compare_results, TimeScale def main() -> None: diff --git a/devtools/inspector/tests/TARGETS b/devtools/inspector/tests/TARGETS new file mode 100644 index 00000000000..eada6817bcb --- /dev/null +++ b/devtools/inspector/tests/TARGETS @@ -0,0 +1,41 @@ +load("@fbcode_macros//build_defs:python_unittest.bzl", "python_unittest") + +oncall("executorch") + +python_unittest( + name = "inspector_test", + srcs = ["inspector_test.py"], + deps = [ + "//executorch/devtools:lib", + "//executorch/devtools/debug_format:et_schema", + "//executorch/devtools/etdump:schema_flatcc", + "//executorch/devtools/etrecord/tests:etrecord_test_library", + "//executorch/devtools/inspector:inspector", + "//executorch/devtools/inspector:lib", + "//executorch/exir:lib", + ], +) + +python_unittest( + name = "event_blocks_test", + srcs = ["event_blocks_test.py"], + deps = [ + "//executorch/devtools/etdump:schema_flatcc", + "//executorch/devtools/inspector:inspector", + "//executorch/devtools/inspector:lib", + ], +) + +python_unittest( + name = "inspector_utils_test", + srcs = ["inspector_utils_test.py"], + deps = [ + "//caffe2:torch", + "//executorch/devtools:lib", + "//executorch/devtools/debug_format:base_schema", + "//executorch/devtools/debug_format:et_schema", + "//executorch/devtools/etdump:schema_flatcc", + "//executorch/devtools/etrecord/tests:etrecord_test_library", + "//executorch/devtools/inspector:inspector_utils", + ], +) diff --git a/sdk/inspector/tests/event_blocks_test.py b/devtools/inspector/tests/event_blocks_test.py similarity index 89% rename from sdk/inspector/tests/event_blocks_test.py rename to devtools/inspector/tests/event_blocks_test.py index 7c7da001860..85b65aa5f34 100644 --- a/sdk/inspector/tests/event_blocks_test.py +++ b/devtools/inspector/tests/event_blocks_test.py @@ -8,10 +8,10 @@ import unittest from typing import List, Optional, Tuple, Union -import executorch.sdk.etdump.schema_flatcc as flatcc -from executorch.sdk.etdump.schema_flatcc import ETDumpFlatCC, ProfileEvent -from executorch.sdk.inspector import Event, EventBlock, PerfData -from executorch.sdk.inspector._inspector import ( +import executorch.devtools.etdump.schema_flatcc as flatcc +from executorch.devtools.etdump.schema_flatcc import ETDumpFlatCC, ProfileEvent +from executorch.devtools.inspector import Event, EventBlock, PerfData +from executorch.devtools.inspector._inspector import ( DelegateMetadata, EventSignature, InstructionEvent, @@ -62,6 +62,7 @@ def _gen_sample_profile_event( def _gen_sample_debug_event( instruction_id: int, delegate_debug_id: Optional[Union[int, str]] = None, + name: str = "test_debug_event", ) -> flatcc.DebugEvent: """ Helper for generating test DebugEvents @@ -77,6 +78,7 @@ def _gen_sample_debug_event( ) return flatcc.DebugEvent( + name=name, chain_index=0, instruction_id=instruction_id, delegate_debug_id_int=delegate_debug_id_int, @@ -299,6 +301,42 @@ def _get_sample_etdump_flatcc_profiling_and_debugging() -> flatcc.ETDumpFlatCC: return ETDumpFlatCC(version=0, run_data=[run_data_1, run_data_2, run_data_3]) + @staticmethod + def _get_sample_etdump_flatcc_debug_events_only( + event_name: str, + delegate_debug_id: str, + ) -> flatcc.ETDumpFlatCC: + """ + Helper for getting a sample ETDumpFlatCC object with RunData signature_a + and (debug_event_delegated, 
debug_event_non_delegated, no profile event) + """ + + debug_event_delegated = TestEventBlock._gen_sample_debug_event( + instruction_id=1, delegate_debug_id=delegate_debug_id, name=event_name + ) + debug_event_non_delegated = TestEventBlock._gen_sample_debug_event( + instruction_id=1, name=event_name + ) + run_data_1 = flatcc.RunData( + name="signature_a", + bundled_input_index=-1, + allocators=[], + events=[ + flatcc.Event( + allocation_event=None, + debug_event=debug_event_delegated, + profile_event=None, + ), + flatcc.Event( + allocation_event=None, + debug_event=debug_event_non_delegated, + profile_event=None, + ), + ], + ) + + return ETDumpFlatCC(version=0, run_data=[run_data_1]) + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Tests ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ def test_gen_from_etdump(self) -> None: @@ -370,6 +408,30 @@ def test_gen_from_etdump_inconsistent_debug_data(self) -> None: with self.assertRaises(AssertionError): EventBlock._gen_from_etdump(etdump) + def test_gen_from_etdump_debug_events_only(self) -> None: + """ + Test generation of EventBlocks given an ETDump with only debugging events + + Specifically it tests: + - Correct number of EventBlocks and Events + - Correct name of each Event + """ + event_name = "test_debug_event_only" + delegate_debug_id = "debug_id" + etdump: ETDumpFlatCC = ( + TestEventBlock._get_sample_etdump_flatcc_debug_events_only( + event_name=event_name, + delegate_debug_id=delegate_debug_id, + ) + ) + event_blocks = EventBlock._gen_from_etdump(etdump) + self.assertEqual(len(event_blocks), 1) + self.assertEqual(len(event_blocks[0].events), 2) + # Delegated event uses delegate_debug_id as event name + self.assertEqual(event_blocks[0].events[0].name, delegate_debug_id) + # Non delegated event uses event_name as event name + self.assertEqual(event_blocks[0].events[1].name, event_name) + def test_inspector_event_generation(self) -> None: """ Test Inspector.Event derivation from various ProfileEvent cases diff --git a/sdk/inspector/tests/inspector_test.py b/devtools/inspector/tests/inspector_test.py similarity index 89% rename from sdk/inspector/tests/inspector_test.py rename to devtools/inspector/tests/inspector_test.py index a372c7c569c..34c96eef534 100644 --- a/sdk/inspector/tests/inspector_test.py +++ b/devtools/inspector/tests/inspector_test.py @@ -4,31 +4,41 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
+# pyre-unsafe + import random import statistics import tempfile import unittest from contextlib import redirect_stdout -from typing import List +from typing import Callable, List from unittest.mock import patch -from executorch.exir import ExportedProgram -from executorch.sdk import generate_etrecord, parse_etrecord -from executorch.sdk.debug_format.et_schema import OperatorNode -from executorch.sdk.etdump.schema_flatcc import ProfileEvent -from executorch.sdk.etrecord.tests.etrecord_test import TestETRecord - -from executorch.sdk.inspector import _inspector, Event, EventBlock, Inspector, PerfData -from executorch.sdk.inspector._inspector import ( +from executorch.devtools import generate_etrecord, parse_etrecord +from executorch.devtools.debug_format.et_schema import OperatorNode +from executorch.devtools.etdump.schema_flatcc import ProfileEvent +from executorch.devtools.etrecord.tests.etrecord_test import TestETRecord + +from executorch.devtools.inspector import ( + _inspector, + Event, + EventBlock, + Inspector, + PerfData, +) +from executorch.devtools.inspector._inspector import ( DebugEventSignature, flatcc, InstructionEvent, InstructionEventSignature, ProfileEventSignature, + TimeScale, ) +from executorch.exir import ExportedProgram + OP_TYPE = "aten::add" EVENT_BLOCK_NAME = "block_0" @@ -81,6 +91,33 @@ def test_inspector_constructor(self): # Because we mocked parse_etrecord() to return None, this method shouldn't be called mock_gen_graphs_from_etrecord.assert_not_called() + def test_default_delegate_time_scale_converter(self): + # Create a context manager to patch functions called by Inspector.__init__ + with patch.object( + _inspector, "parse_etrecord", return_value=None + ), patch.object( + _inspector, "gen_etdump_object", return_value=None + ), patch.object( + EventBlock, "_gen_from_etdump" + ) as mock_gen_from_etdump, patch.object( + _inspector, "gen_graphs_from_etrecord" + ), patch.object( + _inspector, "create_debug_handle_to_op_node_mapping" + ): + # Call the constructor of Inspector + Inspector( + etdump_path=ETDUMP_PATH, + etrecord=ETRECORD_PATH, + source_time_scale=TimeScale.US, + target_time_scale=TimeScale.S, + ) + + # Verify delegate_time_scale_converter is set to be a callable + self.assertIsInstance( + mock_gen_from_etdump.call_args.get("delegate_time_scale_converter"), + Callable, + ) + def test_inspector_print_data_tabular(self): # Create a context manager to patch functions called by Inspector.__init__ with patch.object( @@ -281,6 +318,7 @@ def test_populate_debugging_related_fields_raises_for_inconsistent_events(self): ) debug_event_0 = flatcc.DebugEvent( + name="event", chain_index=1, instruction_id=0, delegate_debug_id_int=1, @@ -304,6 +342,7 @@ def test_populate_debugging_related_fields_raises_for_inconsistent_events(self): # Note the sizes of this tensor are different from the previous one debug_event_1 = flatcc.DebugEvent( + name="event", chain_index=1, instruction_id=0, delegate_debug_id_int=1, @@ -348,6 +387,7 @@ def test_populate_debugging_related_fields_passes_for_consistent_events(self): ) debug_event_0 = flatcc.DebugEvent( + name="event", chain_index=1, instruction_id=0, delegate_debug_id_int=1, @@ -371,6 +411,7 @@ def test_populate_debugging_related_fields_passes_for_consistent_events(self): # Same as the event above except for offset debug_event_1 = flatcc.DebugEvent( + name="event", chain_index=1, instruction_id=0, delegate_debug_id_int=1, diff --git a/sdk/inspector/tests/inspector_utils_test.py b/devtools/inspector/tests/inspector_utils_test.py 
similarity index 88% rename from sdk/inspector/tests/inspector_utils_test.py rename to devtools/inspector/tests/inspector_utils_test.py index b5b9b54d6c4..73511f5fcd7 100644 --- a/sdk/inspector/tests/inspector_utils_test.py +++ b/devtools/inspector/tests/inspector_utils_test.py @@ -4,30 +4,34 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +# pyre-unsafe + import tempfile import unittest from typing import Dict, Tuple import torch -from executorch.sdk import generate_etrecord, parse_etrecord +from executorch.devtools import generate_etrecord, parse_etrecord -from executorch.sdk.debug_format.base_schema import ( +from executorch.devtools.debug_format.base_schema import ( OperatorGraph, OperatorNode, ValueNode, ) -from executorch.sdk.debug_format.et_schema import FXOperatorGraph -from executorch.sdk.etdump import schema_flatcc as flatcc +from executorch.devtools.debug_format.et_schema import FXOperatorGraph +from executorch.devtools.etdump import schema_flatcc as flatcc -from executorch.sdk.etrecord.tests.etrecord_test import TestETRecord -from executorch.sdk.inspector._inspector_utils import ( +from executorch.devtools.etrecord.tests.etrecord_test import TestETRecord +from executorch.devtools.inspector._inspector_utils import ( + calculate_time_scale_factor, create_debug_handle_to_op_node_mapping, EDGE_DIALECT_GRAPH_KEY, find_populated_event, gen_graphs_from_etrecord, is_inference_output_equal, + TimeScale, ) @@ -74,6 +78,7 @@ def test_find_populated_event(self): end_time=2002, ) debug_event = flatcc.DebugEvent( + name="test_debug_event", chain_index=1, instruction_id=0, delegate_debug_id_str="56", @@ -170,6 +175,19 @@ def test_is_inference_output_equal_returns_true_for_same_strs(self): ) ) + def test_calculate_time_scale_factor_second_based(self): + self.assertEqual( + calculate_time_scale_factor(TimeScale.NS, TimeScale.MS), 1000000 + ) + self.assertEqual( + calculate_time_scale_factor(TimeScale.MS, TimeScale.NS), 1 / 1000000 + ) + + def test_calculate_time_scale_factor_cycles(self): + self.assertEqual( + calculate_time_scale_factor(TimeScale.CYCLES, TimeScale.CYCLES), 1 + ) + def gen_mock_operator_graph_with_expected_map() -> ( Tuple[OperatorGraph, Dict[int, OperatorNode]] diff --git a/sdk/size_analysis_tool/TARGETS b/devtools/size_analysis_tool/TARGETS similarity index 86% rename from sdk/size_analysis_tool/TARGETS rename to devtools/size_analysis_tool/TARGETS index 44ae0aa6f8b..c365ba152d5 100644 --- a/sdk/size_analysis_tool/TARGETS +++ b/devtools/size_analysis_tool/TARGETS @@ -12,9 +12,9 @@ python_library( visibility = ["PUBLIC"], deps = [ "//caffe2:torch", + "//executorch/devtools:lib", "//executorch/exir:lib", "//executorch/exir/backend:backend_api", - "//executorch/sdk:lib", ], ) @@ -23,13 +23,13 @@ python_binary( srcs = [ "size_analysis_tool.py", ], - main_function = "executorch.sdk.size_analysis_tool.size_analysis_tool.main", + main_function = "executorch.devtools.size_analysis_tool.size_analysis_tool.main", visibility = ["PUBLIC"], deps = [ "//caffe2:torch", + "//executorch/devtools:lib", "//executorch/exir:lib", "//executorch/exir/backend:backend_api", - "//executorch/sdk:lib", ], ) @@ -43,9 +43,9 @@ python_unittest( "//caffe2:torch", "//executorch/backends/xnnpack/partition:xnnpack_partitioner", "//executorch/backends/xnnpack/utils:xnnpack_utils", + "//executorch/devtools:lib", "//executorch/exir:lib", "//executorch/exir/backend:backend_api", "//executorch/exir/passes:spec_prop_pass", - 
"//executorch/sdk:lib", ], ) diff --git a/sdk/size_analysis_tool/size_analysis_tool.py b/devtools/size_analysis_tool/size_analysis_tool.py similarity index 99% rename from sdk/size_analysis_tool/size_analysis_tool.py rename to devtools/size_analysis_tool/size_analysis_tool.py index d17ec5ac477..8ea8ddbbf49 100644 --- a/sdk/size_analysis_tool/size_analysis_tool.py +++ b/devtools/size_analysis_tool/size_analysis_tool.py @@ -9,10 +9,10 @@ from typing import Any, Callable, Dict, List, Optional, Tuple import torch +from executorch.devtools import parse_etrecord from executorch.exir import ExportedProgram from executorch.exir.backend.backend_api import LoweredBackendModule -from executorch.sdk import parse_etrecord def _get_tensor_data(node: torch.fx.Node, tensor: torch.Tensor) -> Dict[str, Any]: diff --git a/sdk/size_analysis_tool/size_analysis_tool_test.py b/devtools/size_analysis_tool/size_analysis_tool_test.py similarity index 98% rename from sdk/size_analysis_tool/size_analysis_tool_test.py rename to devtools/size_analysis_tool/size_analysis_tool_test.py index 3e1efec77b5..96feae7e423 100644 --- a/sdk/size_analysis_tool/size_analysis_tool_test.py +++ b/devtools/size_analysis_tool/size_analysis_tool_test.py @@ -14,12 +14,12 @@ get_xnnpack_executorch_backend_config, ) from executorch.backends.xnnpack.utils.utils import capture_graph_for_xnnpack -from executorch.exir.backend.backend_api import to_backend, validation_disabled -from executorch.exir.passes.spec_prop_pass import SpecPropPass -from executorch.sdk.size_analysis_tool.size_analysis_tool import ( +from executorch.devtools.size_analysis_tool.size_analysis_tool import ( generate_model_size_information, ) +from executorch.exir.backend.backend_api import to_backend, validation_disabled +from executorch.exir.passes.spec_prop_pass import SpecPropPass class SizeAnalysisToolTest(unittest.TestCase): diff --git a/sdk/targets.bzl b/devtools/targets.bzl similarity index 76% rename from sdk/targets.bzl rename to devtools/targets.bzl index 38c2e6e820e..17d9e89cad3 100644 --- a/sdk/targets.bzl +++ b/devtools/targets.bzl @@ -4,5 +4,5 @@ def build_sdk(): def get_sdk_flags(): sdk_flags = [] if build_sdk(): - sdk_flags += ["-DEXECUTORCH_BUILD_SDK"] + sdk_flags += ["-DEXECUTORCH_BUILD_DEVTOOLS"] return sdk_flags diff --git a/docs/source/Doxyfile b/docs/source/Doxyfile index b741509197d..e662105b83f 100644 --- a/docs/source/Doxyfile +++ b/docs/source/Doxyfile @@ -964,8 +964,7 @@ INPUT = ../runtime/executor/memory_manager.h \ ../runtime/core/tensor_shape_dynamism.h \ ../runtime/platform/compiler.h \ ../runtime/executor/ \ - ../runtime/platform/ \ - ../util/ + ../runtime/platform/ diff --git a/docs/source/_static/img/benchmark-infra.png b/docs/source/_static/img/benchmark-infra.png new file mode 100644 index 00000000000..a5d30774257 Binary files /dev/null and b/docs/source/_static/img/benchmark-infra.png differ diff --git a/docs/source/_static/img/chat.png b/docs/source/_static/img/chat.png new file mode 100644 index 00000000000..e7ed934519d Binary files /dev/null and b/docs/source/_static/img/chat.png differ diff --git a/docs/source/_static/img/chat_response.png b/docs/source/_static/img/chat_response.png new file mode 100644 index 00000000000..714265276fe Binary files /dev/null and b/docs/source/_static/img/chat_response.png differ diff --git a/docs/source/_static/img/ios_demo_app.jpg b/docs/source/_static/img/ios_demo_app.jpg new file mode 100644 index 00000000000..076508d0e0d Binary files /dev/null and b/docs/source/_static/img/ios_demo_app.jpg differ 
diff --git a/docs/source/_static/img/ios_demo_app_choosing_package.png b/docs/source/_static/img/ios_demo_app_choosing_package.png new file mode 100644 index 00000000000..20599d7ea80 Binary files /dev/null and b/docs/source/_static/img/ios_demo_app_choosing_package.png differ diff --git a/docs/source/_static/img/ios_demo_app_llava.jpg b/docs/source/_static/img/ios_demo_app_llava.jpg new file mode 100644 index 00000000000..316d68b71bd Binary files /dev/null and b/docs/source/_static/img/ios_demo_app_llava.jpg differ diff --git a/docs/source/_static/img/ios_demo_app_mps.jpg b/docs/source/_static/img/ios_demo_app_mps.jpg new file mode 100644 index 00000000000..58114f869c6 Binary files /dev/null and b/docs/source/_static/img/ios_demo_app_mps.jpg differ diff --git a/docs/source/_static/img/ios_demo_app_swift_pm.png b/docs/source/_static/img/ios_demo_app_swift_pm.png new file mode 100644 index 00000000000..19e7a6726e1 Binary files /dev/null and b/docs/source/_static/img/ios_demo_app_swift_pm.png differ diff --git a/docs/source/_static/img/llava_example.png b/docs/source/_static/img/llava_example.png new file mode 100644 index 00000000000..ccac335ee65 Binary files /dev/null and b/docs/source/_static/img/llava_example.png differ diff --git a/docs/source/_static/img/load_complete_and_start_prompt.png b/docs/source/_static/img/load_complete_and_start_prompt.png new file mode 100644 index 00000000000..43d81f10d00 Binary files /dev/null and b/docs/source/_static/img/load_complete_and_start_prompt.png differ diff --git a/docs/source/_static/img/logs.png b/docs/source/_static/img/logs.png new file mode 100644 index 00000000000..e35227a1c0c Binary files /dev/null and b/docs/source/_static/img/logs.png differ diff --git a/docs/source/_static/img/mtk_changes_to_shell_file.png b/docs/source/_static/img/mtk_changes_to_shell_file.png new file mode 100644 index 00000000000..7fa4e461863 Binary files /dev/null and b/docs/source/_static/img/mtk_changes_to_shell_file.png differ diff --git a/docs/source/_static/img/mtk_output.png b/docs/source/_static/img/mtk_output.png new file mode 100644 index 00000000000..e41d54c3561 Binary files /dev/null and b/docs/source/_static/img/mtk_output.png differ diff --git a/docs/source/_static/img/opening_the_app_details.png b/docs/source/_static/img/opening_the_app_details.png new file mode 100644 index 00000000000..60494ecc69d Binary files /dev/null and b/docs/source/_static/img/opening_the_app_details.png differ diff --git a/docs/source/_static/img/settings_menu.png b/docs/source/_static/img/settings_menu.png new file mode 100644 index 00000000000..028e6b55cd7 Binary files /dev/null and b/docs/source/_static/img/settings_menu.png differ diff --git a/docs/source/apple-runtime.md b/docs/source/apple-runtime.md index 2378ddc2bd2..023903db3b2 100644 --- a/docs/source/apple-runtime.md +++ b/docs/source/apple-runtime.md @@ -19,6 +19,19 @@ Link your binary with the ExecuTorch runtime and any backends or kernels used by ## Integration +### Setup + +#### CMake + +Building the Xcode project requires CMake. Installing via homebrew does not +typically work; instead, install the packaged application and commandline tools +globally: + +1. Download the macOS `.dmg` installer from https://cmake.org/download +2. Open the `.dmg` +3. Drag the CMake app to the `/Applications` folder +4. 
In a terminal, install the command line tools: `sudo /Applications/CMake.app/Contents/bin/cmake-gui --install` + ### Swift Package Manager The prebuilt ExecuTorch runtime, backend, and kernels are available as a [Swift PM](https://www.swift.org/documentation/package-manager/) package. diff --git a/docs/source/build-run-qualcomm-ai-engine-direct-backend.md b/docs/source/build-run-qualcomm-ai-engine-direct-backend.md index c774ae57b43..94a936b2e7a 100644 --- a/docs/source/build-run-qualcomm-ai-engine-direct-backend.md +++ b/docs/source/build-run-qualcomm-ai-engine-direct-backend.md @@ -59,9 +59,7 @@ This example is verified with SM8550 and SM8450. - Click the "Get Software" button to download a version of QNN SDK. - However, at the moment of updating this tutorial, the above website doesn't provide QNN SDK newer than 2.22.6. - The below is public links to download various QNN versions. Hope they can be publicly discoverable soon. - - [QNN 2.25.0](https://softwarecenter.qualcomm.com/api/download/software/qualcomm_neural_processing_sdk/v2.25.0.240728.zip) - - [QNN 2.24.0](https://softwarecenter.qualcomm.com/api/download/software/qualcomm_neural_processing_sdk/v2.24.0.240626.zip) - - [QNN 2.23.0](https://softwarecenter.qualcomm.com/api/download/software/qualcomm_neural_processing_sdk/v2.23.0.24.06.24.zip) + - [QNN 2.26.0](https://softwarecenter.qualcomm.com/api/download/software/qualcomm_neural_processing_sdk/v2.26.0.240828.zip) The directory with installed Qualcomm AI Engine Direct SDK looks like: ``` @@ -126,16 +124,17 @@ Python APIs on x64 are required to compile models to Qualcomm AI Engine Direct b ```bash cd $EXECUTORCH_ROOT -mkdir cmake-out -cd cmake-out +mkdir build-x86 +cd build-x86 # Note that the below command might change. # Please refer to the above build.sh for latest workable commands. cmake .. \ -DCMAKE_INSTALL_PREFIX=$PWD \ -DEXECUTORCH_BUILD_QNN=ON \ -DQNN_SDK_ROOT=${QNN_SDK_ROOT} \ - -DEXECUTORCH_BUILD_SDK=ON \ + -DEXECUTORCH_BUILD_DEVTOOLS=ON \ -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ -DEXECUTORCH_ENABLE_EVENT_TRACER=ON \ -DPYTHON_EXECUTABLE=python3 \ -DEXECUTORCH_SEPARATE_FLATCC_HOST_PROJECT=OFF @@ -158,15 +157,16 @@ Commands to build `qnn_executor_runner` for Android: ```bash cd $EXECUTORCH_ROOT -mkdir cmake-out-android -cd cmake-out-android +mkdir build-android +cd build-android # build executorch & qnn_executorch_backend cmake .. 
\ -DCMAKE_INSTALL_PREFIX=$PWD \ -DEXECUTORCH_BUILD_QNN=ON \ -DQNN_SDK_ROOT=$QNN_SDK_ROOT \ - -DEXECUTORCH_BUILD_SDK=ON \ + -DEXECUTORCH_BUILD_DEVTOOLS=ON \ -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ -DEXECUTORCH_ENABLE_EVENT_TRACER=ON \ -DPYTHON_EXECUTABLE=python3 \ -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_ROOT/build/cmake/android.toolchain.cmake \ @@ -189,7 +189,7 @@ cmake ../examples/qualcomm \ cmake --build examples/qualcomm -j$(nproc) # qnn_executor_runner can be found under examples/qualcomm -# The full path is $EXECUTORCH_ROOT/cmake-out-android/examples/qualcomm/qnn_executor_runner +# The full path is $EXECUTORCH_ROOT/build-android/examples/qualcomm/qnn_executor_runner ls examples/qualcomm ``` @@ -209,7 +209,7 @@ cd $EXECUTORCH_ROOT cp schema/program.fbs exir/_serialize/program.fbs cp schema/scalar_type.fbs exir/_serialize/scalar_type.fbs -python -m examples.qualcomm.scripts.deeplab_v3 -b cmake-out-android -m SM8550 --compile_only --download +python -m examples.qualcomm.scripts.deeplab_v3 -b build-android -m SM8550 --compile_only --download ``` You might see something like below: @@ -239,7 +239,7 @@ We can test model inferences before deploying it to a device by HTP emulator. Let's build `qnn_executor_runner` for a x64 host: ```bash # assuming the AOT component is built. -cd $EXECUTORCH_ROOT/cmake-out +cd $EXECUTORCH_ROOT/build-x86 cmake ../examples/qualcomm \ -DCMAKE_PREFIX_PATH="$PWD/lib/cmake/ExecuTorch;$PWD/third-party/gflags;" \ -DCMAKE_FIND_ROOT_PATH_MODE_PACKAGE=BOTH \ @@ -249,14 +249,14 @@ cmake ../examples/qualcomm \ cmake --build examples/qualcomm -j$(nproc) # qnn_executor_runner can be found under examples/qualcomm -# The full path is $EXECUTORCH_ROOT/cmake-out/examples/qualcomm/qnn_executor_runner +# The full path is $EXECUTORCH_ROOT/build-x86/examples/qualcomm/qnn_executor_runner ls examples/qualcomm/ ``` To run the HTP emulator, the dynamic linker need to access QNN libraries and `libqnn_executorch_backend.so`. We set the below two paths to `LD_LIBRARY_PATH` environment variable: 1. `$QNN_SDK_ROOT/lib/x86_64-linux-clang/` - 2. `$EXECUTORCH_ROOT/cmake-out/lib/` + 2. `$EXECUTORCH_ROOT/build-x86/lib/` The first path is for QNN libraries including HTP emulator. It has been configured in the AOT compilation section. @@ -264,8 +264,8 @@ The second path is for `libqnn_executorch_backend.so`. So, we can run `./deeplab_v3/dlv3_qnn.pte` by: ```bash -cd $EXECUTORCH_ROOT/cmake-out -export LD_LIBRARY_PATH=$EXECUTORCH_ROOT/cmake-out/lib/:$LD_LIBRARY_PATH +cd $EXECUTORCH_ROOT/build-x86 +export LD_LIBRARY_PATH=$EXECUTORCH_ROOT/build-x86/lib/:$LD_LIBRARY_PATH examples/qualcomm/qnn_executor_runner --model_path ../deeplab_v3/dlv3_qnn.pte ``` @@ -308,8 +308,8 @@ So, we can run `qnn_executor_runner` like ```bash adb push ./deeplab_v3/dlv3_qnn.pte ${DEVICE_DIR} -adb push ${EXECUTORCH_ROOT}/cmake-out-android/examples/qualcomm/executor_runner/qnn_executor_runner ${DEVICE_DIR} -adb push ${EXECUTORCH_ROOT}/cmake-out-android/lib/libqnn_executorch_backend.so ${DEVICE_DIR} +adb push ${EXECUTORCH_ROOT}/build-android/examples/qualcomm/executor_runner/qnn_executor_runner ${DEVICE_DIR} +adb push ${EXECUTORCH_ROOT}/build-android/lib/libqnn_executorch_backend.so ${DEVICE_DIR} adb shell "cd ${DEVICE_DIR} \ && export LD_LIBRARY_PATH=${DEVICE_DIR} \ && export ADSP_LIBRARY_PATH=${DEVICE_DIR} \ @@ -333,7 +333,7 @@ I 00:00:00.364875 executorch:qnn_executor_runner.cpp:425] Write etdump to etdump The model is merely executed. 
If we want to feed real inputs and get model outputs, we can use ```bash cd $EXECUTORCH_ROOT -python -m examples.qualcomm.scripts.deeplab_v3 -b cmake-out-android -m SM8550 --download -s +python -m examples.qualcomm.scripts.deeplab_v3 -b build-android -m SM8550 --download -s ``` The `` can be found by `adb devices` command. @@ -354,7 +354,7 @@ Please refer to `$EXECUTORCH_ROOT/examples/qualcomm/scripts/` and `EXECUTORCH_RO ## What is coming? - - [llama2 and llama3](https://github.com/pytorch/executorch/pull/4030). Note that at the moment of writing, we still suffer from the quantization issue in llama2-7B and llama3-8B cases. Only storiesllama works well. + - Improve the performance for llama3-8B-Instruct and support batch prefill. - We will support pre-compiled binaries from [Qualcomm AI Hub](https://aihub.qualcomm.com/). ## FAQ diff --git a/docs/source/compiler-delegate-and-partitioner.md b/docs/source/compiler-delegate-and-partitioner.md index fa41ec93c9d..c82af7d98fe 100644 --- a/docs/source/compiler-delegate-and-partitioner.md +++ b/docs/source/compiler-delegate-and-partitioner.md @@ -127,13 +127,13 @@ static auto success_with_compiler = register_backend(backend); ``` -## SDK Integration: Debuggability +## Developer Tools Integration: Debuggability -Providing consistent debugging experience, be it for runtime failures or performance profiling, is important. ExecuTorch employs native SDK (Software Development Kit) for this purpose, which enables correlating program instructions to original PyTorch code, via debug handles. You can read more about it [here](./sdk-etrecord). +Providing a consistent debugging experience, be it for runtime failures or performance profiling, is important. ExecuTorch employs its native Developer Tools for this purpose, which enable correlating program instructions to original PyTorch code via debug handles. You can read more about it [here](./sdk-etrecord). -Delegated program or subgraphs are opaque to ExecuTorch runtime and appear as a special `call_delegate` instruction, which asks corresponding backend to handle the execution of the subgraph or program. Due to the opaque nature of backend delgates, native SDK does not have visibility into delegated program. Thus the debugging, functional or performance, experiences of delegated execution suffers significantly as compared to it's non-delegated counterpart. +Delegated programs or subgraphs are opaque to the ExecuTorch runtime and appear as a special `call_delegate` instruction, which asks the corresponding backend to handle the execution of the subgraph or program. Due to the opaque nature of backend delegates, the native Developer Tools do not have visibility into the delegated program. Thus the debugging experience, be it functional or performance, of delegated execution suffers significantly compared to its non-delegated counterpart. -In order to provide consistent debugging experience to users, regardless of the use of delegation for a model, SDK provides an interface to correlate delegated (sub)graph to original (sub)graph. The SDK does so via debug handles map which allows delegates to generate internal handles that can be associated with the original (sub)graph consumed by the delegate. Then at runtime, backend developer can report error or profiling information using the internal handle, which will be mapped to original (sub)graph using the debug handle map. For more information, please refer to [SDK delegate integration](./sdk-delegate-integration).
+In order to provide consistent debugging experience to users, regardless of the use of delegation for a model, Developer Tools provide an interface to correlate delegated (sub)graph to original (sub)graph. The Developer Tools do so via debug handles map which allows delegates to generate internal handles that can be associated with the original (sub)graph consumed by the delegate. Then at runtime, backend developer can report error or profiling information using the internal handle, which will be mapped to original (sub)graph using the debug handle map. For more information, please refer to [Developer Tools Delegate Integration](./sdk-delegate-integration). By leveraging the debug identifier, backend developer can embed the debug as part of the delegated blob diff --git a/docs/source/compiler-memory-planning.md b/docs/source/compiler-memory-planning.md index 1dad3b032fc..fcad2eca58b 100644 --- a/docs/source/compiler-memory-planning.md +++ b/docs/source/compiler-memory-planning.md @@ -32,7 +32,6 @@ The `MemoryPlanningPass` exposes the option to not memory plan program inputs an program = edge_program.to_executorch( exir.ExecutorchBackendConfig( memory_planning_pass=MemoryPlanningPass( - memory_planning_algo="greedy", alloc_graph_input=False, # Inputs will not be memory planned, the data_ptr for input tensors after model load will be nullptr alloc_graph_output=True, # Outputs will be memory planned, the data_ptr for input tensors after model load will be in the `planned_memory`. ) @@ -77,7 +76,7 @@ Then later when lowering to ExecuTorch you can use your custom plan in the follo program = edge_program.to_executorch( exir.ExecutorchBackendConfig( memory_planning_pass=CustomPoolMemoryPlanningPass( - memory_planning_algo="greedy", + memory_planning_algo=greedy, ) ) ) diff --git a/docs/source/concepts.md b/docs/source/concepts.md index 33d944c376a..c085505b61a 100644 --- a/docs/source/concepts.md +++ b/docs/source/concepts.md @@ -283,9 +283,9 @@ Techniques for performing computations and memory accesses on tensors with lower The ExecuTorch runtime executes models on edge devices. It is responsible for program initialization, program execution and, optionally, destruction (releasing backend owned resources). -## [SDK](./sdk-overview.md) +## [Developer Tools](./devtools-overview.md) -Software Development Kit. The tooling users need to profile, debug and visualize programs that are running with ExecuTorch. +A collection of tools users need to profile, debug and visualize programs that are running with ExecuTorch. ## [Selective build](./kernel-library-selective-build.md) diff --git a/docs/source/devtools-overview.md b/docs/source/devtools-overview.md new file mode 100644 index 00000000000..13fd8e00597 --- /dev/null +++ b/docs/source/devtools-overview.md @@ -0,0 +1,44 @@ +# Introduction to the ExecuTorch Developer Tools + +ExecuTorch has been designed with [productivity](./intro-overview.md) as one of its core objectives and the ExecuTorch Developer Tools enable this through the comprehensive suite of tools it provides users to help them profile, debug, and visualize models that they have onboarded onto ExecuTorch. + +All the components of the Developer Tools have been designed from the ground up with deep integration in both the export process and the runtime. This enables us to provide unique features such as linking back operator execution in the runtime to the line of code in the original eager model that this operator originated from. 
+ +## Developer Tools Features + +The ExecuTorch Developer Tools support the following features: + +- **BundledProgram** is a utility tool for exporting the model bundled with a sample set of (representative) inputs and expected outputs, so that during runtime users can validate that the actual output is in fact the same as the expected output. +- **Profiling** models with operator level breakdown of performance stats + - Linking back operator performance stats to source code and module hierarchy + - Model loading and execution time +- **Delegate Integration** - Surfacing performance details from delegate backends + - Link back delegate operator execution to the nodes they represent in the edge dialect graph (and subsequently linking back to source code and module hierarchy) +- **Debugging** - Intermediate outputs and output quality analysis +- **Visualization** - Coming soon + +## Fundamental components of the Developer Tools + +This section details the fundamental components that power the Developer Tools, so that you can fully understand and leverage them. + +### ETRecord +ETRecord (ExecuTorch Record) is an artifact generated during the export process that stores the graphs and other metadata that is critical for the Developer Tools to be able to link back the performance/debug data sourced from the runtime to the source code of the eager model. + +To draw a rough equivalence to conventional software development, ETRecord can be considered as the binary built with debug symbols that is used for debugging in the GNU Project debugger (gdb). + +More details are available in the [ETRecord documentation](sdk-etrecord.rst) on how to generate and store an ETRecord. + +### ETDump +ETDump (ExecuTorch Dump) is the binary blob that is generated by the runtime after running a model. Similarly, to draw a rough equivalence to conventional software development, ETDump can be considered as the coredump of ExecuTorch, but in this case within ETDump we store all the performance and debug data that was generated by the runtime during model execution. + +```{note} +If you only care about looking at the raw performance data without linking back to source code and other extensive features, an ETDump alone will be enough to leverage the basic features of the Developer Tools. For the full experience, it is recommended that users also generate an ETRecord. +``` + +More details are available in the [ETDump documentation](sdk-etdump.md) on how to generate and store an ETDump from the runtime. + + +### Inspector APIs +The Inspector Python APIs are the main user entry point into the Developer Tools. They join the data sourced from ETDump and ETRecord to give users access to all the performance and debug data sourced from the runtime, along with linkage back to eager model source code and module hierarchy, in an easy-to-use API. + +More details are available in the [Inspector API documentation](sdk-inspector.rst) on how to use the Inspector APIs. diff --git a/docs/source/devtools-tutorial.md b/docs/source/devtools-tutorial.md new file mode 100644 index 00000000000..33d78cf58da --- /dev/null +++ b/docs/source/devtools-tutorial.md @@ -0,0 +1,3 @@ +## Developer Tools Usage Tutorial + +Please refer to the [Developer Tools tutorial](./tutorials/devtools-integration-tutorial) for a walkthrough on how to profile a model in ExecuTorch using the Developer Tools.
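To make the relationship between these components concrete, below is a minimal sketch of how ETRecord, ETDump, and the Inspector might fit together for a toy model. The file names, the tiny `Add` module, and the final `print_data_tabular()` call are illustrative assumptions rather than a prescribed recipe; the linked ETRecord, ETDump, and Inspector pages (and the tutorial above) remain the authoritative references.

```python
import copy

import torch
from executorch.devtools import Inspector, generate_etrecord
from executorch.exir import to_edge
from torch.export import export


class Add(torch.nn.Module):
    def forward(self, x, y):
        return x + y


# Ahead of time: export, lower, and generate an ETRecord next to the program.
exported = export(Add(), (torch.ones(1), torch.ones(1)))
edge = to_edge(exported)
edge_copy = copy.deepcopy(edge)  # copy before to_executorch() mutates the edge program
et_program = edge.to_executorch()
generate_etrecord("etrecord.bin", edge_copy, et_program)

# At runtime, executing the .pte with ETDump enabled produces e.g. "etdump.etdp".
# The Inspector joins the two artifacts to expose per-operator performance data
# with linkage back to the eager source.
inspector = Inspector(etdump_path="etdump.etdp", etrecord="etrecord.bin")
inspector.print_data_tabular()
```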
diff --git a/docs/source/extension-module.md b/docs/source/extension-module.md index 9e236e8e489..7516184d1cc 100644 --- a/docs/source/extension-module.md +++ b/docs/source/extension-module.md @@ -22,7 +22,7 @@ Tensor::SizesType sizes[] = {1, 3, 256, 256}; TensorImpl tensor(ScalarType::Float, std::size(sizes), sizes, input); // Perform an inference. -const auto result = module.forward({EValue(Tensor(&tensor))}); +const auto result = module.forward(Tensor(&tensor)); // Check for success or failure. if (result.ok()) { @@ -105,13 +105,13 @@ Note: `method_meta()` will try to force-load the `Method` when called for the fi Assuming that the `Program`'s method names and their input format is known ahead of time, we rarely need to query for those and can run the methods directly by name using the `execute()` function: ```cpp -const auto result = module.execute("forward", {EValue(Tensor(&tensor))}); +const auto result = module.execute("forward", Tensor(&tensor)); ``` Which can also be simplified for the standard `forward()` method name as: ```cpp -const auto result = module.forward({EValue(Tensor(&tensor))}); +const auto result = module.forward(Tensor(&tensor)); ``` Note: `execute()` or `forward()` will try to force load the `Program` and the `Method` when called for the first time. Therefore, the first inference will take more time than subsequent ones as it loads the model lazily and prepares it for execution unless the `Program` or `Method` was loaded explicitly earlier using the corresponding functions. @@ -132,7 +132,7 @@ Use [ExecuTorch Dump](sdk-etdump.md) to trace model execution. Create an instanc #include #include #include -#include +#include using namespace ::torch::executor; diff --git a/docs/source/getting-started-architecture.md b/docs/source/getting-started-architecture.md index 2c3f85aff17..937b5b389f5 100644 --- a/docs/source/getting-started-architecture.md +++ b/docs/source/getting-started-architecture.md @@ -87,8 +87,8 @@ The ExecuTorch runtime is written in C++ with minimal dependencies for portabili _Executor_ is the entry point to load the program and execute it. The execution triggers corresponding operator kernels or backend execution from this very minimal runtime. -## SDK +## Developer Tools -It should be efficient for users to go from research to production using the flow above. Productivity is essentially important, for users to author, optimize and deploy their models. We provide [ExecuTorch SDK](./sdk-overview.md) to improve productivity. The SDK is not in the diagram. Instead it's a tool set that covers the developer workflow in all three phases. +It should be efficient for users to go from research to production using the flow above. Productivity is essentially important, for users to author, optimize and deploy their models. We provide [ExecuTorch Developer Tools](./devtools-overview.md) to improve productivity. The Developer Tools are not in the diagram. Instead it's a tool set that covers the developer workflow in all three phases. -During the program preparation and execution, users can use the ExecuTorch SDK to profile, debug, or visualize the program. Since the end-to-end flow is within the PyTorch ecosystem, users can correlate and display performance data along with graph visualization as well as direct references to the program source code and model hierarchy. We consider this to be a critical component for quickly iterating and lowering PyTorch programs to edge devices and environments. 
+During the program preparation and execution, users can use the ExecuTorch Developer Tools to profile, debug, or visualize the program. Since the end-to-end flow is within the PyTorch ecosystem, users can correlate and display performance data along with graph visualization as well as direct references to the program source code and model hierarchy. We consider this to be a critical component for quickly iterating and lowering PyTorch programs to edge devices and environments. diff --git a/docs/source/getting-started-setup.md b/docs/source/getting-started-setup.md index d610f020ef2..15fa084e33f 100644 --- a/docs/source/getting-started-setup.md +++ b/docs/source/getting-started-setup.md @@ -59,13 +59,11 @@ also work in similar environments. - We recommend `conda` as it provides cross-language support and integrates smoothly with `pip` (Python's built-in package manager) - Otherwise, Python's built-in virtual environment manager `python venv` is a good alternative. -* `g++` version 8 or higher, `clang++` version 8 or higher, or another - C++17-compatible toolchain that supports GNU C-style [statement - expressions](https://gcc.gnu.org/onlinedocs/gcc/Statement-Exprs.html) (`({ ... - })` syntax). +* `g++` version 7 or higher, `clang++` version 5 or higher, or another + C++17-compatible toolchain. Note that the cross-compilable core runtime code supports a wider range of -toolchains, down to C++11. See the [Runtime Overview](./runtime-overview.md) for +toolchains, down to C++17. See the [Runtime Overview](./runtime-overview.md) for portability details. ## Quick Setup: Colab/Jupyter Notebook Prototype @@ -112,6 +110,23 @@ Alternatively, if you would like to experiment with ExecuTorch quickly and easil ``` After setting up your environment, you are ready to convert your PyTorch programs to ExecuTorch. + +> **_NOTE:_** Cleaning the build system +> +> When fetching a new version of the upstream repo (via `git fetch` or `git +> pull`) it is a good idea to clean the old build artifacts. The build system +> does not currently adapt well to changes in build dependencies. +> +> You should also update and pull the submodules again, in case their versions +> have changed. +> +> ```bash +> # From the root of the executorch repo: +> rm -rf cmake-out pip-out +> git submodule sync +> git submodule update --init +> ``` + ## Create an ExecuTorch program After setting up your environment, you are ready to convert your PyTorch programs @@ -171,13 +186,30 @@ For now, let's use [`executor_runner`](https://github.com/pytorch/executorch/blo ### Build Tooling Setup The ExecuTorch repo uses CMake to build its C++ code. Here, we'll configure it to build the `executor_runner` tool to run it on our desktop OS. ```bash - # Clean and configure the CMake build system. Compiled programs will appear in the executorch/cmake-out directory we create here. + # Clean and configure the CMake build system. Compiled programs will + # appear in the executorch/cmake-out directory we create here. (rm -rf cmake-out && mkdir cmake-out && cd cmake-out && cmake ..) # Build the executor_runner target cmake --build cmake-out --target executor_runner -j9 ``` +> **_NOTE:_** Cleaning the build system +> +> When fetching a new version of the upstream repo (via `git fetch` or `git +> pull`) it is a good idea to clean the old build artifacts. The build system +> does not currently adapt well to changes in build dependencies. +> +> You should also update and pull the submodules again, in case their versions +> have changed. 
+> +> ```bash +> # From the root of the executorch repo: +> rm -rf cmake-out pip-out +> git submodule sync +> git submodule update --init +> ``` + ### Run Your Program Now that we've exported a program and built the runtime, let's execute it! diff --git a/docs/source/index.rst b/docs/source/index.rst index d8955c513e4..d49fd43e31b 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -94,7 +94,7 @@ Topics in this section will help you get started with ExecuTorch. tutorials/export-to-executorch-tutorial running-a-model-cpp-tutorial extension-module - tutorials/sdk-integration-tutorial + tutorials/devtools-integration-tutorial apple-runtime demo-apps-ios demo-apps-android @@ -117,6 +117,9 @@ Topics in this section will help you get started with ExecuTorch. :hidden: llm/getting-started + llm/llama-demo-android + llm/build-run-llama3-qualcomm-ai-engine-direct-backend + llm/llama-demo-ios .. toctree:: :glob: @@ -193,10 +196,10 @@ Topics in this section will help you get started with ExecuTorch. .. toctree:: :glob: :maxdepth: 1 - :caption: SDK + :caption: Developer Tools :hidden: - sdk-overview + devtools-overview sdk-bundled-io sdk-etrecord sdk-etdump @@ -204,7 +207,7 @@ Topics in this section will help you get started with ExecuTorch. sdk-debugging sdk-inspector sdk-delegate-integration - sdk-tutorial + devtools-tutorial .. toctree:: :glob: @@ -244,11 +247,11 @@ ExecuTorch tutorials. :tags: .. customcarditem:: - :header: Using the ExecuTorch SDK to Profile a Model - :card_description: A tutorial for using the ExecuTorch SDK to profile and analyze a model with linkage back to source code. + :header: Using the ExecuTorch Developer Tools to Profile a Model + :card_description: A tutorial for using the ExecuTorch Developer Tools to profile and analyze a model with linkage back to source code. :image: _static/img/generic-pytorch-logo.png - :link: tutorials/sdk-integration-tutorial.html - :tags: SDK + :link: tutorials/devtools-integration-tutorial.html + :tags: devtools .. customcarditem:: :header: Integrating and Running ExecuTorch on Apple Platforms diff --git a/docs/source/intro-overview.md b/docs/source/intro-overview.md index f80caff4679..96c7982b8fe 100644 --- a/docs/source/intro-overview.md +++ b/docs/source/intro-overview.md @@ -10,9 +10,9 @@ Key value propositions of ExecuTorch are: - **Portability:** Compatibility with a wide variety of computing platforms, from high-end mobile phones to highly constrained embedded systems and microcontrollers. -- **Productivity:** Enabling developers to use the same toolchains and SDK from - PyTorch model authoring and conversion, to debugging and deployment to a wide - variety of platforms. +- **Productivity:** Enabling developers to use the same toolchains and Developer + Tools from PyTorch model authoring and conversion, to debugging and deployment + to a wide variety of platforms. - **Performance:** Providing end users with a seamless and high-performance experience due to a lightweight runtime and utilizing full hardware capabilities such as CPUs, NPUs, and DSPs. diff --git a/docs/source/kernel-library-custom-aten-kernel.md b/docs/source/kernel-library-custom-aten-kernel.md index 8fb4ed96cd5..0f060d1c5e5 100644 --- a/docs/source/kernel-library-custom-aten-kernel.md +++ b/docs/source/kernel-library-custom-aten-kernel.md @@ -3,23 +3,49 @@ At the last stage of [ExecuTorch model exporting](./export-overview.md), we lower the operators in the dialect to the _out variants_ of the [core ATen operators](./ir-ops-set-definition.md). 
Then we serialize these operator names into the model artifact. During runtime execution, for each operator name we will need to find the actual _kernels_, i.e., the C++ functions that do the heavy-lifting calculations and return results. -Portable kernel library is the in-house default kernel library, it’s easy to use and portable for most of the target backends. However it’s not optimized for performance, because it’s not specialized for any certain target. Therefore we provide kernel registration APIs for ExecuTorch users to easily register their own optimized kernels. +## Kernel Libraries +### First-party kernel libraries: +**[Portable kernel library](https://github.com/pytorch/executorch/tree/main/kernels/portable)** is the in-house default kernel library that covers most of the core ATen operators. It’s easy to use/read and is written in portable C++17. However it’s not optimized for performance, because it’s not specialized for any certain target. Therefore we provide kernel registration APIs for ExecuTorch users to easily register their own optimized kernels. -## Design Principles +**[Optimized kernel library](https://github.com/pytorch/executorch/tree/main/kernels/optimized)** specializes on performance for some of the operators, leveraging existing third party libraries such as [EigenBLAS](https://gitlab.com/libeigen/eigen). This works best along with the portable kernel library, with a good balance on portability and performance. One example of combining these two libraries can be found [here](https://github.com/pytorch/executorch/blob/main/configurations/CMakeLists.txt). -**What do we support?** On the operator coverage side, the kernel registration APIs allow users to register kernels for all core ATen ops as well as custom ops, as long as the custom ops schemas are specified. +**[Quantized kernel library](https://github.com/pytorch/executorch/tree/main/kernels/quantized)** implements operators for quantization and dequantization. These are out of core ATen operators but are vital to most of the production use cases. -Notice that we also support _partial kernels, _for example the kernel only supports a subset of tensor dtypes and/or dim orders. +### Custom kernel libraries: -**Kernel contract**: kernels need to comply with the following requirements: +**Custom kernels implementing core ATen ops**. Even though we don't have an internal example for custom kernels for core ATen ops, the optimized kernel library can be viewed as a good example. We have optimized [`add.out`](https://github.com/pytorch/executorch/blob/main/kernels/optimized/cpu/op_add.cpp) and a portable [`add.out`](https://github.com/pytorch/executorch/blob/main/kernels/portable/cpu/op_add.cpp). When user is combining these two libraries, we provide APIs to choose which kernel to use for `add.out`. In order to author and use custom kernels implementing core ATen ops, using the [YAML based approach](#yaml-entry-for-core-aten-op-out-variant) is recommended, because it provides full fledged support on + 1. combining kernel libraries and define fallback kernels; + 2. using selective build to minimize the kernel size. + +A **[Custom operator](https://github.com/pytorch/executorch/tree/main/extension/llm/custom_ops)** is any operator that an ExecuTorch user defines outside of PyTorch's [`native_functions.yaml`](https://github.com/pytorch/pytorch/blob/main/aten/src/ATen/native/native_functions.yaml). 
+ +## Operator & Kernel Contract + +All the kernels mentioned above, whether they are in-house or customized, should comply with the following requirements: * Match the calling convention derived from operator schema. The kernel registration API will generate headers for the custom kernels as references. -* Satisfy the dtype constraints defined in edge dialect. For tensors with certain dtypes as arguments, the result of a custom kernel needs to match the expected dtypes. The constraints are available in edge dialect ops. -* Gives correct result. We will provide a testing framework to automatically test the custom kernels. +* Satisfy the dtype constraints defined in edge dialect. For tensors with certain dtypes as arguments, the result of a custom kernel needs to match the expected dtypes. The constraints are available in edge dialect ops. +* Give correct result. We will provide a testing framework to automatically test the custom kernels. + + +## APIs + +These are the APIs available to register kernels/custom kernels/custom ops into ExecuTorch: + +* [YAML Entry API](#yaml-entry-api-high-level-architecture) + - [for core ATen op with custom kernels](#yaml-entry-api-for-core-aten-op-out-variant) + - [for custom ops](#yaml-entry-api-for-custom-ops) + - [CMake Macros](#cmake-macros) +* C++ API + - [for custom ops](#c-api-for-custom-ops) + - [CMake Example](#compile-and-link-the-custom-kernel) + +If it's not clear which API to use, please see [Best Practices](#custom-ops-api-best-practices). + -## High Level Architecture +### YAML Entry API High Level Architecture ![](./_static/img/kernel-library-custom-aten-kernel.png) @@ -27,10 +53,10 @@ ExecuTorch users are asked to provide: 1. the custom kernel library with C++ implementations -2. a yaml file associated with the library that describes what operators are being implemented by this library. For partial kernels, the yaml file also contains information on the dtypes and dim orders supported by the kernel. More details in the API section. +2. a YAML file associated with the library that describes what operators are being implemented by this library. For partial kernels, the yaml file also contains information on the dtypes and dim orders supported by the kernel. More details in the API section. -### Workflow +### YAML Entry API Workflow At build time, the yaml files associated with kernel libraries will be passed to the _kernel resolver_ along with the model op info (see selective build doc) and the outcome is a mapping between a combination of operator names and tensor metadata, to kernel symbols. Then codegen tools will use this mapping to generate C++ bindings that connect the kernels to ExecuTorch runtime. ExecuTorch users need to link this generated library into their application to use these kernels. @@ -38,18 +64,10 @@ At static object initialization time, kernels will be registered into the ExecuT At runtime initialization stage, ExecuTorch will use the operator name and argument metadata as a key to lookup for the kernels. For example, with “aten::add.out” and inputs being float tensors with dim order (0, 1, 2, 3), ExecuTorch will go into the kernel registry and lookup for a kernel that matches the name and the input metadata. - -## APIs - -There are two sets of APIs: yaml files that describe kernel - operator mappings and codegen tools to consume these mappings. 
- - -### Yaml Entry for Core ATen Op Out Variant +### YAML Entry API for Core ATen Op Out Variant Top level attributes: - - * `op` (if the operator appears in `native_functions.yaml`) or `func` for custom operator. The value for this key needs to be the full operator name (including overload name) for `op` key, or a full operator schema (namespace, operator name, operator overload name and schema string), if we are describing a custom operator. For schema syntax please refer to this [instruction](https://github.com/pytorch/pytorch/blob/main/aten/src/ATen/native/README.md). * `kernels`: defines kernel information. It consists of `arg_meta` and `kernel_name`, which are bound together to describe "for input tensors with these metadata, use this kernel". * `type_alias`(optional): we are giving aliases to possible dtype options. `T0: [Double, Float]` means `T0` can be one of `Double` or `Float`. @@ -86,86 +104,9 @@ ATen operator with a dtype/dim order specialized kernel (works for `Double` dtyp kernel_name: torch::executor::add_out ``` -### Custom Ops C++ API - -For a custom kernel that implements a custom operator, we provides 2 ways to register it into ExecuTorch runtime: -1. Using `EXECUTORCH_LIBRARY` and `WRAP_TO_ATEN` C++ macros, covered by this section. -2. Using `functions.yaml` and codegen'd C++ libraries, covered by [next section](#custom-ops-yaml-entry). - -Please refer to [Custom Ops Best Practices](#custom-ops-api-best-practices) on which API to use. - -The first option requires C++17 and doesn't have selective build support yet, but it's faster than the second option where we have to go through yaml authoring and build system tweaking. - -The first option is particularly suitable for fast prototyping but can also be used in production. - -Similar to `TORCH_LIBRARY`, `EXECUTORCH_LIBRARY` takes the operator name and the C++ function name and register them into ExecuTorch runtime. - -#### Prepare custom kernel implementation - -Define your custom operator schema for both functional variant (used in AOT compilation) and out variant (used in ExecuTorch runtime). The schema needs to follow PyTorch ATen convention (see native_functions.yaml). For example: - -```yaml -custom_linear(Tensor weight, Tensor input, Tensor(?) bias) -> Tensor -custom_linear.out(Tensor weight, Tensor input, Tensor(?) bias, *, Tensor(a!) out) -> Tensor(a!) -``` - -Then write your custom kernel according to the schema using ExecuTorch types, along with APIs to register to ExecuTorch runtime: - - -```c++ -// custom_linear.h/custom_linear.cpp -#include -Tensor& custom_linear_out(const Tensor& weight, const Tensor& input, optional bias, Tensor& out) { - // calculation - return out; -} -``` -#### Use a C++ macro to register it into PyTorch & ExecuTorch - -Append the following line in the example above: -```c++ -// custom_linear.h/custom_linear.cpp -// opset namespace myop -EXECUTORCH_LIBRARY(myop, "custom_linear.out", custom_linear_out); -``` - -Now we need to write some wrapper for this op to show up in PyTorch, but don’t worry we don’t need to rewrite the kernel. 
Create a separate .cpp for this purpose: - -```c++ -// custom_linear_pytorch.cpp -#include "custom_linear.h" -#include - -at::Tensor custom_linear(const at::Tensor& weight, const at::Tensor& input, std::optional bias) { - // initialize out - at::Tensor out = at::empty({weight.size(1), input.size(1)}); - // wrap kernel in custom_linear.cpp into ATen kernel - WRAP_TO_ATEN(custom_linear_out, 3)(weight, input, bias, out); - return out; -} -// standard API to register ops into PyTorch -TORCH_LIBRARY(myop, m) { - m.def("custom_linear(Tensor weight, Tensor input, Tensor(?) bias) -> Tensor", custom_linear); - m.def("custom_linear.out(Tensor weight, Tensor input, Tensor(?) bias, *, Tensor(a!) out) -> Tensor(a!)", WRAP_TO_ATEN(custom_linear_out, 3)); -} -``` - -#### Compile and link the custom kernel - -Link it into ExecuTorch runtime: In our `CMakeLists.txt`` that builds the binary/application, we just need to add custom_linear.h/cpp into the binary target. We can build a dynamically loaded library (.so or .dylib) and link it as well. - -Link it into PyTorch runtime: We need to package custom_linear.h, custom_linear.cpp and custom_linear_pytorch.cpp into a dynamically loaded library (.so or .dylib) and load it into our python environment. One way of doing this is: - -```python -import torch -torch.ops.load_library("libcustom_linear.so/dylib") - -# Now we have access to the custom op, backed by kernel implemented in custom_linear.cpp. -op = torch.ops.myop.custom_linear.default -``` -### Custom Ops Yaml Entry +### YAML Entry API for Custom Ops As mentioned above, this option provides more support in terms of selective build and features such as merging operator libraries. @@ -215,14 +156,11 @@ ExecuTorch does not support all of the argument types that core PyTorch supports * List> * Optional> - -### Build Tool Macros +#### CMake Macros We provide build time macros to help users to build their kernel registration library. The macro takes the yaml file describing the kernel library as well as model operator metadata, and packages the generated C++ bindings into a C++ library. The macro is available on CMake. -#### CMake - `generate_bindings_for_kernels(FUNCTIONS_YAML functions_yaml CUSTOM_OPS_YAML custom_ops_yaml)` takes a yaml file for core ATen op out variants and also a yaml file for custom ops, generate C++ bindings for kernel registration. It also depends on the selective build artifact generated by `gen_selected_ops()`, see selective build doc for more information. Then `gen_operators_lib` will package those bindings to be a C++ library. As an example: ```cmake # SELECT_OPS_LIST: aten::add.out,aten::mm.out @@ -263,6 +201,103 @@ And out fallback: The merged yaml will have the entry in functions.yaml. +### C++ API for Custom Ops + +Unlike the YAML entry API, the C++ API only uses C++ macros `EXECUTORCH_LIBRARY` and `WRAP_TO_ATEN` for kernel registration, also without selective build support. It makes this API faster in terms of development speed, since users don't have to do YAML authoring and build system tweaking. + +Please refer to [Custom Ops Best Practices](#custom-ops-api-best-practices) on which API to use. + +Similar to [`TORCH_LIBRARY`](https://pytorch.org/cppdocs/library.html#library_8h_1a0bd5fb09d25dfb58e750d712fc5afb84) in PyTorch, `EXECUTORCH_LIBRARY` takes the operator name and the C++ function name and register them into ExecuTorch runtime. 
+ +#### Prepare custom kernel implementation + +Define your custom operator schema for both functional variant (used in AOT compilation) and out variant (used in ExecuTorch runtime). The schema needs to follow PyTorch ATen convention (see `native_functions.yaml`). For example: + +```yaml +custom_linear(Tensor weight, Tensor input, Tensor(?) bias) -> Tensor +custom_linear.out(Tensor weight, Tensor input, Tensor(?) bias, *, Tensor(a!) out) -> Tensor(a!) +``` + +Then write your custom kernel according to the schema using ExecuTorch types, along with APIs to register to ExecuTorch runtime: + + +```c++ +// custom_linear.h/custom_linear.cpp +#include +Tensor& custom_linear_out(const Tensor& weight, const Tensor& input, optional bias, Tensor& out) { + // calculation + return out; +} +``` +#### Use a C++ macro to register it into ExecuTorch + +Append the following line in the example above: +```c++ +// custom_linear.h/custom_linear.cpp +// opset namespace myop +EXECUTORCH_LIBRARY(myop, "custom_linear.out", custom_linear_out); +``` + +Now we need to write some wrapper for this op to show up in PyTorch, but don’t worry we don’t need to rewrite the kernel. Create a separate .cpp for this purpose: + +```c++ +// custom_linear_pytorch.cpp +#include "custom_linear.h" +#include + +at::Tensor custom_linear(const at::Tensor& weight, const at::Tensor& input, std::optional bias) { + // initialize out + at::Tensor out = at::empty({weight.size(1), input.size(1)}); + // wrap kernel in custom_linear.cpp into ATen kernel + WRAP_TO_ATEN(custom_linear_out, 3)(weight, input, bias, out); + return out; +} +// standard API to register ops into PyTorch +TORCH_LIBRARY(myop, m) { + m.def("custom_linear(Tensor weight, Tensor input, Tensor(?) bias) -> Tensor", custom_linear); + m.def("custom_linear.out(Tensor weight, Tensor input, Tensor(?) bias, *, Tensor(a!) out) -> Tensor(a!)", WRAP_TO_ATEN(custom_linear_out, 3)); +} +``` + +#### Compile and link the custom kernel + +Link it into ExecuTorch runtime: In our `CMakeLists.txt` that builds the binary/application, we need to add custom_linear.h/cpp into the binary target. We can build a dynamically loaded library (.so or .dylib) and link it as well. + +Here's an example to do it: + +```cmake +# For target_link_options_shared_lib +include(${EXECUTORCH_ROOT}/build/Utils.cmake) + +# Add a custom op library +add_library(custom_op_lib SHARED ${CMAKE_CURRENT_SOURCE_DIR}/custom_op.cpp) + +# Include the header +target_include_directory(custom_op_lib PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include) + +# Link ExecuTorch library +target_link_libraries(custom_op_lib PUBLIC executorch) + +# Define a binary target +add_executable(custom_op_runner PUBLIC main.cpp) + +# Link this library with --whole-archive !! IMPORTANT !! this is to avoid the operators being stripped by linker +target_link_options_shared_lib(custom_op_lib) + +# Link custom op lib +target_link_libraries(custom_op_runner PUBLIC custom_op_lib) + +``` + +Link it into the PyTorch runtime: We need to package custom_linear.h, custom_linear.cpp and custom_linear_pytorch.cpp into a dynamically loaded library (.so or .dylib) and load it into our python environment. One way of doing this is: + +```python +import torch +torch.ops.load_library("libcustom_linear.so/dylib") + +# Now we have access to the custom op, backed by kernel implemented in custom_linear.cpp. 
+op = torch.ops.myop.custom_linear.default +``` ### Custom Ops API Best Practices diff --git a/docs/source/llm/build-run-llama3-qualcomm-ai-engine-direct-backend.md b/docs/source/llm/build-run-llama3-qualcomm-ai-engine-direct-backend.md new file mode 100644 index 00000000000..ac95fb21bd8 --- /dev/null +++ b/docs/source/llm/build-run-llama3-qualcomm-ai-engine-direct-backend.md @@ -0,0 +1,128 @@ +# Building and Running Llama 3 8B Instruct with Qualcomm AI Engine Direct Backend + +This tutorial demonstrates how to export Llama 3 8B Instruct for Qualcomm AI Engine Direct Backend and running the model on a Qualcomm device. + +## Prerequisites + +- Set up your ExecuTorch repo and environment if you haven’t done so by following [the Setting up ExecuTorch](../getting-started-setup.md) to set up the repo and dev environment. +- Read [the Building and Running ExecuTorch with Qualcomm AI Engine Direct Backend page](../build-run-qualcomm-ai-engine-direct-backend.md) to understand how to export and run a model with Qualcomm AI Engine Direct Backend on Qualcomm device. +- Follow [the README for executorch llama](https://github.com/pytorch/executorch/tree/main/examples/models/llama2) to know how to run a llama model on mobile via ExecuTorch. +- A Qualcomm device with 16GB RAM + - We are continuing to optimize our memory usage to ensure compatibility with lower memory devices. +- The version of [Qualcomm AI Engine Direct SDK](https://developer.qualcomm.com/software/qualcomm-ai-engine-direct-sdk) is 2.26.0 or above. + +## Instructions + +### Step1: Prepare the checkpoint of the model and optimized matrix from [Spin Quant](https://github.com/facebookresearch/SpinQuant) + +1. For Llama 3 tokenizer and checkpoint, please refer to https://github.com/meta-llama/llama-models/blob/main/README.md for further instructions on how to download `tokenizer.model`, `consolidated.00.pth` and `params.json`. +2. To get the optimized matrix, please refer to [SpinQuant on GitHub](https://github.com/facebookresearch/SpinQuant). You can download the optimized rotation matrices in the Quantized Models section. Please choose **LLaMA-3-8B/8B_W4A16KV16_lr_1.5_seed_0**. + +### Step2: Export to ExecuTorch with Qualcomm AI Engine Direct Backend +Deploying large language models like Llama 3 on-device presents the following challenges: + +1. The model size is too large to fit in device memory for inference. +2. High model loading and inference time. +3. Difficulty in quantization. + +To address these challenges, we have implemented the following solutions: +1. Using `--pt2e_quantize qnn_16a4w` to quantize activations and weights, thereby reducing the on-disk model size and alleviating memory pressure during inference. +2. Using `--num_sharding 8` to shard the model into sub-parts. +3. Performing graph transformations to convert or decompose operations into more accelerator-friendly operations. +4. Using `--optimized_rotation_path ` to apply R1 and R2 of [Spin Quant](https://github.com/facebookresearch/SpinQuant) to improve accuracy. +5. Using `--calibration_data "<|start_header_id|>system<|end_header_id|..."` to ensure that during the quantization of Llama 3 8B instruct, the calibration includes special tokens in the prompt template. For more details on the prompt template, refer to [the model card of meta llama3 instruct](https://llama.meta.com/docs/model-cards-and-prompt-formats/meta-llama-3/). + +To export Llama 3 8B instruct with the Qualcomm AI Engine Direct Backend, ensure the following: + +1. 
The host machine has more than 100GB of memory (RAM + swap space). +2. The entire process takes a few hours. + +```bash +# Please note that calibration_data must include the prompt template for special tokens. +python -m examples.models.llama2.export_llama -t +llama3/Meta-Llama-3-8B-Instruct/tokenizer.model -p -c --use_kv_cache --qnn --pt2e_quantize qnn_16a4w --disable_dynamic_shape --num_sharding 8 --calibration_tasks wikitext --calibration_limit 1 --calibration_seq_length 128 --optimized_rotation_path --calibration_data "<|start_header_id|>system<|end_header_id|>\n\nYou are a funny chatbot.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nCould you tell me about Facebook?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n" +``` + +### Step3: Invoke the Runtime on an Android smartphone with Qualcomm SoCs +1. Build executorch with Qualcomm AI Engine Direct Backend for android + ```bash + cmake \ + -DCMAKE_TOOLCHAIN_FILE="${ANDROID_NDK_ROOT}/build/cmake/android.toolchain.cmake" \ + -DANDROID_ABI=arm64-v8a \ + -DANDROID_PLATFORM=android-23 \ + -DCMAKE_INSTALL_PREFIX=cmake-android-out \ + -DCMAKE_BUILD_TYPE=Release \ + -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ + -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ + -DEXECUTORCH_BUILD_QNN=ON \ + -DQNN_SDK_ROOT=${QNN_SDK_ROOT} \ + -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ + -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ + -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \ + -Bcmake-android-out . + + cmake --build cmake-android-out -j16 --target install --config Release + ``` +2. Build llama runner for android +```bash + cmake \ + -DCMAKE_TOOLCHAIN_FILE="${ANDROID_NDK_ROOT}"/build/cmake/android.toolchain.cmake \ + -DANDROID_ABI=arm64-v8a \ + -DANDROID_PLATFORM=android-23 \ + -DCMAKE_INSTALL_PREFIX=cmake-android-out \ + -DCMAKE_BUILD_TYPE=Release -DPYTHON_EXECUTABLE=python \ + -DEXECUTORCH_BUILD_QNN=ON \ + -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ + -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ + -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \ + -Bcmake-android-out/examples/models/llama2 examples/models/llama2 + + cmake --build cmake-android-out/examples/models/llama2 -j16 --config Release +``` +3. Run on Android via adb shell +*Pre-requisite*: Make sure you enable USB debugging via developer options on your phone + +**3.1 Connect your android phone** + +**3.2 We need to push required QNN libraries to the device.** +```bash +# make sure you have write-permission on below path. 
+DEVICE_DIR=/data/local/tmp/llama +adb shell mkdir -p ${DEVICE_DIR} +adb push ${QNN_SDK_ROOT}/lib/aarch64-android/libQnnHtp.so ${DEVICE_DIR} +adb push ${QNN_SDK_ROOT}/lib/aarch64-android/libQnnSystem.so ${DEVICE_DIR} +adb push ${QNN_SDK_ROOT}/lib/aarch64-android/libQnnHtpV69Stub.so ${DEVICE_DIR} +adb push ${QNN_SDK_ROOT}/lib/aarch64-android/libQnnHtpV73Stub.so ${DEVICE_DIR} +adb push ${QNN_SDK_ROOT}/lib/aarch64-android/libQnnHtpV75Stub.so ${DEVICE_DIR} +adb push ${QNN_SDK_ROOT}/lib/hexagon-v69/unsigned/libQnnHtpV69Skel.so ${DEVICE_DIR} +adb push ${QNN_SDK_ROOT}/lib/hexagon-v73/unsigned/libQnnHtpV73Skel.so ${DEVICE_DIR} +adb push ${QNN_SDK_ROOT}/lib/hexagon-v75/unsigned/libQnnHtpV75Skel.so ${DEVICE_DIR} +``` + +**3.3 Upload model, tokenizer and llama runner binary to phone** +```bash +adb push ${DEVICE_DIR} +adb push ${DEVICE_DIR} +adb push cmake-android-out/lib/libqnn_executorch_backend.so ${DEVICE_DIR} +adb push cmake-out-android/examples/models/llama2/llama_main ${DEVICE_DIR} +``` + +**3.4 Run model** +```bash +adb shell "cd ${DEVICE_DIR} && ./llama_main --model_path --tokenizer_path --prompt \"<|start_header_id|>system<|end_header_id|>\n\nYou are a funny chatbot.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nCould you tell me about Facebook?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n\" --seq_len 128" +``` +You should see the message: +``` +<|start_header_id|>system<|end_header_id|>\n\nYou are a funny chatbot.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nCould you tell me about Facebook?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nHello! I'd be delighted to chat with you about Facebook. Facebook is a social media platform that was created in 2004 by Mark Zuckerberg and his colleagues while he was a student at Harvard University. It was initially called "Facemaker" but later changed to Facebook, which is a combination of the words "face" and "book". The platform was initially intended for people to share their thoughts and share information with their friends, but it quickly grew to become one of the +``` + +## What is coming? +- Improve the performance for Llama 3 Instruct +- Reduce the memory pressure during inference to support 12GB Qualcomm devices +- Support more LLMs + +## FAQ + +If you encounter any issues while reproducing the tutorial, please file a github +issue on ExecuTorch repo and tag use `#qcom_aisw` tag diff --git a/docs/source/llm/getting-started.md b/docs/source/llm/getting-started.md index 5fffb7e8caf..771bf489a94 100644 --- a/docs/source/llm/getting-started.md +++ b/docs/source/llm/getting-started.md @@ -198,25 +198,21 @@ Create a file called main.cpp with the following contents: // main.cpp #include -#include -#include -#include -#include "basic_tokenizer.h" #include "basic_sampler.h" -#include "managed_tensor.h" +#include "basic_tokenizer.h" #include -#include +#include +#include #include -#include -#include - -using namespace torch::executor; +#include -using SizesType = exec_aten::SizesType; -using DimOrderType = exec_aten::DimOrderType; -using StridesType = exec_aten::StridesType; +using exec_aten::ScalarType; +using exec_aten::Tensor; +using executorch::extension::Module; +using executorch::runtime::EValue; +using executorch::runtime::Result; ``` The model inputs and outputs take the form of tensors. A tensor can be thought of as an multi-dimensional array. @@ -248,14 +244,13 @@ std::string generate( for (auto i = 0u; i < max_output_length; i++) { // Convert the input_tokens from a vector of int64_t to EValue. 
// EValue is a unified data type in the ExecuTorch runtime. - ManagedTensor tensor_tokens( + auto inputs = from_blob( input_tokens.data(), {1, static_cast(input_tokens.size())}, ScalarType::Long); - std::vector inputs = {tensor_tokens.get_tensor()}; // Run the model. It will return a tensor of logits (log-probabilities). - Result> logits_evalue = llm_model.forward(inputs); + auto logits_evalue = llm_model.forward(inputs); // Convert the output logits from EValue to std::vector, which is what // the sampler expects. @@ -343,7 +338,6 @@ Finally, download the following files into the same directory as main.h: ``` curl -O https://raw.githubusercontent.com/pytorch/executorch/main/examples/llm_manual/basic_sampler.h curl -O https://raw.githubusercontent.com/pytorch/executorch/main/examples/llm_manual/basic_tokenizer.h -curl -O https://raw.githubusercontent.com/pytorch/executorch/main/examples/llm_manual/managed_tensor.h ``` To learn more, see the [Runtime APIs Tutorial](../extension-module.md). @@ -368,6 +362,7 @@ set(CMAKE_CXX_STANDARD_REQUIRED True) # Set options for executorch build. option(EXECUTORCH_BUILD_EXTENSION_DATA_LOADER "" ON) option(EXECUTORCH_BUILD_EXTENSION_MODULE "" ON) +option(EXECUTORCH_BUILD_EXTENSION_TENSOR "" ON) option(EXECUTORCH_BUILD_KERNELS_OPTIMIZED "" ON) # Include the executorch subdirectory. @@ -381,6 +376,7 @@ target_link_libraries( PRIVATE executorch extension_module_static # Provides the Module class + extension_tensor # Provides the TensorPtr class optimized_native_cpu_ops_lib) # Provides baseline cross-platform kernels ``` @@ -390,7 +386,6 @@ At this point, the working directory should contain the following files: - main.cpp - basic_tokenizer.h - basic_sampler.h -- managed_tensor.h - export_nanogpt.py - model.py - vocab.json @@ -522,6 +517,7 @@ set(CMAKE_CXX_STANDARD_REQUIRED True) # Set options for executorch build. option(EXECUTORCH_BUILD_EXTENSION_DATA_LOADER "" ON) option(EXECUTORCH_BUILD_EXTENSION_MODULE "" ON) +option(EXECUTORCH_BUILD_EXTENSION_TENSOR "" ON) option(EXECUTORCH_BUILD_KERNELS_OPTIMIZED "" ON) option(EXECUTORCH_BUILD_XNNPACK "" ON) # Build with Xnnpack backend @@ -538,6 +534,7 @@ target_link_libraries( PRIVATE executorch extension_module_static # Provides the Module class + extension_tensor # Provides the TensorPtr class optimized_native_cpu_ops_lib # Provides baseline cross-platform kernels xnnpack_backend) # Provides the XNNPACK CPU acceleration backend ``` @@ -552,7 +549,6 @@ At this point, the working directory should contain the following files: - main.cpp - basic_tokenizer.h - basic_sampler.h -- managed_tensor.h - export_nanogpt.py - model.py - vocab.json @@ -591,8 +587,8 @@ I'm not sure if you've heard of the "Curse of the Dragon" or not, but it's a ver The delegated model should be noticeably faster compared to the non-delegated model. For more information regarding backend delegateion, see the ExecuTorch guides -for the [XNNPACK Backend](../tutorial-xnnpack-delegate-lowering.md) and [Core ML -Backend](../build-run-coreml.md). +for the [XNNPACK Backend](../tutorial-xnnpack-delegate-lowering.md), [Core ML +Backend](../build-run-coreml.md) and [Qualcomm AI Engine Direct Backend](build-run-llama3-qualcomm-ai-engine-direct-backend.md). ## Quantization @@ -750,7 +746,7 @@ In the fragment of the output for nanoGPT below, observe that embedding and add ### Performance Analysis -Through the ExecuTorch SDK, users are able to profile model execution, giving timing information for each operator in the model. 
+Through the ExecuTorch Developer Tools, users are able to profile model execution, giving timing information for each operator in the model. #### Prerequisites @@ -763,7 +759,7 @@ In your export script, after calling `to_edge()` and `to_executorch()`, call `ge ``` import copy -from executorch.sdk import generate_etrecord +from executorch.devtools import generate_etrecord # Make the deep copy immediately after to to_edge() edge_manager_copy = copy.deepcopy(edge_manager) @@ -784,7 +780,7 @@ Include the ETDump header in your code. ```cpp // main.cpp -#include +#include ``` Create an Instance of the ETDumpGen class and pass it to the Module constructor. @@ -809,10 +805,10 @@ if (result.buf != nullptr && result.size > 0) { } ``` -Additionally, update CMakeLists.txt to build with SDK and enable events to be traced and logged into ETDump: +Additionally, update CMakeLists.txt to build with Developer Tools and enable events to be traced and logged into ETDump: ``` -option(EXECUTORCH_BUILD_SDK "" ON) +option(EXECUTORCH_BUILD_DEVTOOLS "" ON) # ... @@ -835,7 +831,7 @@ Run the runner, you will see “etdump.etdp” generated. Once you’ve collected debug artifacts ETDump (and optionally an ETRecord), you can use the Inspector API to view performance information. ```python -from executorch.sdk import Inspector +from executorch.devtools import Inspector inspector = Inspector(etdump_path="etdump.etdp") # If you also generated an ETRecord, then pass that in as well: `inspector = Inspector(etdump_path="etdump.etdp", etrecord="etrecord.bin")` diff --git a/docs/source/native-delegates-executorch-xnnpack-delegate.md b/docs/source/native-delegates-executorch-xnnpack-delegate.md index 1d12daef9d8..41de29687e3 100644 --- a/docs/source/native-delegates-executorch-xnnpack-delegate.md +++ b/docs/source/native-delegates-executorch-xnnpack-delegate.md @@ -74,7 +74,7 @@ Since weight packing creates an extra copy of the weights inside XNNPACK, We fre When executing the XNNPACK subgraphs, we prepare the tensor inputs and outputs and feed them to the XNNPACK runtime graph. After executing the runtime graph, the output pointers are filled with the computed tensors. #### **Profiling** -We have enabled basic profiling for XNNPACK delegate that can be enabled with the following compiler flag `-DENABLE_XNNPACK_PROFILING`. With ExecuTorch's SDK integration, you can also now use the SDK tools to profile the model. You can follow the steps in [Using the ExecuTorch SDK to Profile a Model](./tutorials/sdk-integration-tutorial) on how to profile ExecuTorch models and use SDK's Inspector API to view XNNPACK's internal profiling information. +We have enabled basic profiling for XNNPACK delegate that can be enabled with the following compiler flag `-DENABLE_XNNPACK_PROFILING`. With ExecuTorch's Developer Tools integration, you can also now use the Developer Tools to profile the model. You can follow the steps in [Using the ExecuTorch Developer Tools to Profile a Model](./tutorials/devtools-integration-tutorial) on how to profile ExecuTorch models and use Developer Tools' Inspector API to view XNNPACK's internal profiling information. 
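As a rough sketch of what that inspection can look like, the snippet below opens an ETDump with the Inspector and prints the slowest events in the execute block, which is where delegate and operator timings appear. The attribute names used here (`event_blocks`, `events`, `perf_data.avg`) and the `"Execute"` block name follow the Inspector documentation linked above and should be verified against that page; the reported values are in the Inspector's configured time scale.

```python
from executorch.devtools import Inspector

# An ETDump alone is enough for raw timings; pass etrecord=... as well to link
# events back to source code and module hierarchy.
inspector = Inspector(etdump_path="etdump.etdp")

for event_block in inspector.event_blocks:
    # Skip the model-load block and only look at execution events.
    if event_block.name != "Execute":
        continue
    timed_events = [e for e in event_block.events if e.perf_data is not None]
    # Print the ten slowest events by average runtime.
    for event in sorted(timed_events, key=lambda e: e.perf_data.avg, reverse=True)[:10]:
        print(f"{event.name}: avg {event.perf_data.avg:.3f}")
```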
[comment]: <> (TODO: Refactor quantizer to a more official quantization doc) @@ -110,9 +110,9 @@ quantizer.set_global(quantization_config) ### Quantizing your model with the XNNPACKQuantizer After configuring our quantizer, we are now ready to quantize our model ```python -from torch._export import capture_pre_autograd_graph +from torch.export import export_for_training -exported_model = capture_pre_autograd_graph(model_to_quantize, example_inputs) +exported_model = export_for_training(model_to_quantize, example_inputs).module() prepared_model = prepare_pt2e(exported_model, quantizer) print(prepared_model.graph) ``` diff --git a/docs/source/runtime-overview.md b/docs/source/runtime-overview.md index 7bc8b4dd8b4..6766e678e0e 100644 --- a/docs/source/runtime-overview.md +++ b/docs/source/runtime-overview.md @@ -96,7 +96,7 @@ can build it for a wide variety of target systems. #### C++ Language Considerations -* The code is C++11-compatible to work with older toolchains. +* The code is C++17-compatible to work with older toolchains. * The runtime does not use exceptions or RTTI, although it is not antagonistic to them. * The code is compatible with GCC and Clang, and has also been built with diff --git a/docs/source/sdk-bundled-io.md b/docs/source/sdk-bundled-io.md index 33deae3904b..776c37a5da3 100644 --- a/docs/source/sdk-bundled-io.md +++ b/docs/source/sdk-bundled-io.md @@ -28,7 +28,7 @@ In `BundledProgram`, we create two new classes, `MethodTestCase` and `MethodTest :::{dropdown} `MethodTestCase` ```{eval-rst} -.. autofunction:: executorch.sdk.bundled_program.config.MethodTestCase.__init__ +.. autofunction:: executorch.devtools.bundled_program.config.MethodTestCase.__init__ :noindex: ``` ::: @@ -38,7 +38,7 @@ In `BundledProgram`, we create two new classes, `MethodTestCase` and `MethodTest :::{dropdown} `MethodTestSuite` ```{eval-rst} -.. autofunction:: executorch.sdk.bundled_program.config.MethodTestSuite +.. autofunction:: executorch.devtools.bundled_program.config.MethodTestSuite :noindex: ``` ::: @@ -48,13 +48,13 @@ Since each model may have multiple inference methods, we need to generate `List[ ### Step 3: Generate `BundledProgram` -We provide `BundledProgram` class under `executorch/sdk/bundled_program/core.py` to bundled the `ExecutorchProgram`-like variable, including +We provide `BundledProgram` class under `executorch/devtools/bundled_program/core.py` to bundled the `ExecutorchProgram`-like variable, including `ExecutorchProgram`, `MultiMethodExecutorchProgram` or `ExecutorchProgramManager`, with the `List[MethodTestSuite]`: :::{dropdown} `BundledProgram` ```{eval-rst} -.. autofunction:: executorch.sdk.bundled_program.core.BundledProgram.__init__ +.. autofunction:: executorch.devtools.bundled_program.core.BundledProgram.__init__ :noindex: ``` ::: @@ -65,18 +65,18 @@ Construtor of `BundledProgram `will do sannity check internally to see if the gi ### Step 4: Serialize `BundledProgram` to Flatbuffer. -To serialize `BundledProgram` to make runtime APIs use it, we provide two APIs, both under `executorch/sdk/bundled_program/serialize/__init__.py`. +To serialize `BundledProgram` to make runtime APIs use it, we provide two APIs, both under `executorch/devtools/bundled_program/serialize/__init__.py`. :::{dropdown} Serialize and Deserialize ```{eval-rst} -.. currentmodule:: executorch.sdk.bundled_program.serialize +.. currentmodule:: executorch.devtools.bundled_program.serialize .. autofunction:: serialize_from_bundled_program_to_flatbuffer :noindex: ``` ```{eval-rst} -.. 
currentmodule:: executorch.sdk.bundled_program.serialize +.. currentmodule:: executorch.devtools.bundled_program.serialize .. autofunction:: deserialize_from_flatbuffer_to_bundled_program :noindex: ``` @@ -90,14 +90,13 @@ Here is a flow highlighting how to generate a `BundledProgram` given a PyTorch m import torch from executorch.exir import to_edge -from executorch.sdk import BundledProgram +from executorch.devtools import BundledProgram -from executorch.sdk.bundled_program.config import MethodTestCase, MethodTestSuite -from executorch.sdk.bundled_program.serialize import ( +from executorch.devtools.bundled_program.config import MethodTestCase, MethodTestSuite +from executorch.devtools.bundled_program.serialize import ( serialize_from_bundled_program_to_flatbuffer, ) -from torch._export import capture_pre_autograd_graph -from torch.export import export +from torch.export import export, export_for_training # Step 1: ExecuTorch Program Export @@ -131,7 +130,7 @@ capture_input = ( # Export method's FX Graph. method_graph = export( - capture_pre_autograd_graph(model, capture_input), + export_for_training(model, capture_input).module(), capture_input, ) @@ -187,7 +186,7 @@ with open(save_path, "wb") as f: We can also regenerate `BundledProgram` from flatbuffer file if needed: ```python -from executorch.sdk.bundled_program.serialize import deserialize_from_flatbuffer_to_bundled_program +from executorch.devtools.bundled_program.serialize import deserialize_from_flatbuffer_to_bundled_program save_path = "bundled_program.bpte" with open(save_path, "rb") as f: serialized_bundled_program = f.read() @@ -211,21 +210,19 @@ We need the pointer to ExecuTorch program to do the execution. To unify the proc Here's an example of how to use the `GetProgramData` API: ```c++ -std::shared_ptr buff_ptr; -size_t buff_len; - -// FILE_PATH here can be either BundledProgram or Program flatbuffer file. -Error status = torch::executor::util::read_file_content( - FILE_PATH, &buff_ptr, &buff_len); -ET_CHECK_MSG( - status == Error::Ok, - "read_file_content() failed with status 0x%" PRIx32, - status); - +// Assume that the user has read the contents of the file into file_data using +// whatever method works best for their application. The file could contain +// either BundledProgram data or Program data. +void* file_data = ...; +size_t file_data_len = ...; + +// If file_data contains a BundledProgram, GetProgramData() will return a +// pointer to the Program data embedded inside it. Otherwise it will return +// file_data, which already pointed to Program data. const void* program_ptr; size_t program_len; status = torch::executor::bundled_program::GetProgramData( - buff_ptr.get(), buff_len, &program_ptr, &program_len); + file_data, file_data_len, &program_ptr, &program_len); ET_CHECK_MSG( status == Error::Ok, "GetProgramData() failed with status 0x%" PRIx32, @@ -255,7 +252,7 @@ We call `torch::executor::bundled_program::VerifyResultWithBundledExpectedOutput ### Runtime Example -Here we provide an example about how to run the bundled program step by step. Most of the code is borrowed from [executor_runner](https://github.com/pytorch/executorch/blob/main/examples/sdk/sdk_example_runner/sdk_example_runner.cpp), and please review that file if you need more info and context: +Here we provide an example about how to run the bundled program step by step. 
Most of the code is borrowed from [executor_runner](https://github.com/pytorch/executorch/blob/main/examples/devtools/example_runner/example_runner.cpp), and please review that file if you need more info and context: ```c++ // method_name is the name for the method we want to test @@ -313,9 +310,9 @@ Here's the example of the dtype of test input not meet model's requirement: import torch from executorch.exir import to_edge -from executorch.sdk import BundledProgram +from executorch.devtools import BundledProgram -from executorch.sdk.bundled_program.config import MethodTestCase, MethodTestSuite +from executorch.devtools.bundled_program.config import MethodTestCase, MethodTestSuite from torch.export import export @@ -340,7 +337,7 @@ inputs = (torch.ones(2, 2, dtype=torch.float), ) # Find each method of model needs to be traced my its name, export its FX Graph. method_graph = export( - capture_pre_autograd_graph(model, inputs), + export_for_training(model, inputs).module(), inputs, ) @@ -400,7 +397,7 @@ Cell In[1], line 72 68 ] 70 # Step 3: Generate BundledProgram ---> 72 bundled_program = create_bundled_program(program, method_test_suites) -File /executorch/sdk/bundled_program/core.py:276, in create_bundled_program(program, method_test_suites) +File /executorch/devtools/bundled_program/core.py:276, in create_bundled_program(program, method_test_suites) 264 """Create bp_schema.BundledProgram by bundling the given program and method_test_suites together. 265 266 Args: @@ -411,7 +408,7 @@ File /executorch/sdk/bundled_program/core.py:276, in create_bundled_program(prog --> 276 assert_valid_bundle(program, method_test_suites) 278 bundled_method_test_suites: List[bp_schema.BundledMethodTestSuite] = [] 280 # Emit data and metadata of bundled tensor -File /executorch/sdk/bundled_program/core.py:219, in assert_valid_bundle(program, method_test_suites) +File /executorch/devtools/bundled_program/core.py:219, in assert_valid_bundle(program, method_test_suites) 215 # type of tensor input should match execution plan 216 if type(cur_plan_test_inputs[j]) == torch.Tensor: 217 # pyre-fixme[16]: Undefined attribute [16]: Item `bool` of `typing.Union[bool, float, int, torch._tensor.Tensor]` @@ -449,9 +446,9 @@ Another common error would be the method name in any `MethodTestSuite` does not import torch from executorch.exir import to_edge -from executorch.sdk import BundledProgram +from executorch.devtools import BundledProgram -from executorch.sdk.bundled_program.config import MethodTestCase, MethodTestSuite +from executorch.devtools.bundled_program.config import MethodTestCase, MethodTestSuite from torch.export import export @@ -476,7 +473,7 @@ inputs = (torch.ones(2, 2, dtype=torch.float),) # Find each method of model needs to be traced my its name, export its FX Graph. method_graph = export( - capture_pre_autograd_graph(model, inputs), + export_for_training(model, inputs).module(), inputs, ) @@ -532,7 +529,7 @@ Cell In[3], line 73 70 method_test_suites[0].method_name = "MISSING_METHOD_NAME" 72 # Generate BundledProgram ---> 73 bundled_program = create_bundled_program(program, method_test_suites) -File /executorch/sdk/bundled_program/core.py:276, in create_bundled_program(program, method_test_suites) +File /executorch/devtools/bundled_program/core.py:276, in create_bundled_program(program, method_test_suites) 264 """Create bp_schema.BundledProgram by bundling the given program and method_test_suites together. 
265 266 Args: @@ -543,7 +540,7 @@ File /executorch/sdk/bundled_program/core.py:276, in create_bundled_program(prog --> 276 assert_valid_bundle(program, method_test_suites) 278 bundled_method_test_suites: List[bp_schema.BundledMethodTestSuite] = [] 280 # Emit data and metadata of bundled tensor -File /executorch/sdk/bundled_program/core.py:141, in assert_valid_bundle(program, method_test_suites) +File /executorch/devtools/bundled_program/core.py:141, in assert_valid_bundle(program, method_test_suites) 138 method_name_of_program = {e.name for e in program.execution_plan} 139 method_name_of_test_suites = {t.method_name for t in method_test_suites} --> 141 assert method_name_of_test_suites.issubset( diff --git a/docs/source/sdk-debugging.md b/docs/source/sdk-debugging.md index 45e50b44e87..4707b4a2f99 100644 --- a/docs/source/sdk-debugging.md +++ b/docs/source/sdk-debugging.md @@ -1,6 +1,6 @@ # Debugging Models in ExecuTorch -With the ExecuTorch SDK, users can debug their models for numerical inaccurcies and extract model outputs from their device to do quality analysis (such as Signal-to-Noise, Mean square error etc.). +With the ExecuTorch Developer Tools, users can debug their models for numerical inaccurcies and extract model outputs from their device to do quality analysis (such as Signal-to-Noise, Mean square error etc.). Currently, ExecuTorch supports the following debugging flows: - Extraction of model level outputs via ETDump. @@ -11,7 +11,7 @@ Currently, ExecuTorch supports the following debugging flows: ## Steps to debug a model in ExecuTorch ### Runtime -For a real example reflecting the steps below, please refer to [sdk_example_runner.cpp](https://github.com/pytorch/executorch/blob/main/examples/sdk/sdk_example_runner/sdk_example_runner.cpp). +For a real example reflecting the steps below, please refer to [example_runner.cpp](https://github.com/pytorch/executorch/blob/main/examples/devtools/example_runner/example_runner.cpp). 1. [Optional] Generate an [ETRecord](./sdk-etrecord.rst) while exporting your model. When provided, this enables users to link profiling information back to the eager model source code (with stack traces and module hierarchy). 2. Integrate [ETDump generation](./sdk-etdump.md) into the runtime and set the debugging level by configuring the `ETDumpGen` object. Then, provide an additional buffer to which intermediate outputs and program outputs will be written. Currently we support two levels of debugging: @@ -38,7 +38,7 @@ For a real example reflecting the steps below, please refer to [sdk_example_runn Once a model has been run, using the generated ETDump and debug buffers, users can leverage the [Inspector API's](./sdk-inspector.rst) to inspect these debug outputs. ```python -from executorch.sdk import Inspector +from executorch.devtools import Inspector # Create an Inspector instance with etdump and the debug buffer. inspector = Inspector(etdump_path=etdump_path, @@ -67,7 +67,7 @@ We've also provided a simple set of utilities that let users perform quality ana ```python -from executorch.sdk.inspector._inspector_utils import compare_results +from executorch.devtools.inspector import compare_results # Run a simple quality analysis between the model outputs sourced from the # runtime and a set of reference outputs. 
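To make the quality-analysis step above concrete, the sketch below shows one way the two imports from this section might be combined. It is a minimal, illustrative sketch only: the artifact file names are placeholders, the stand-in tensors take the place of outputs that would normally be pulled from the device run via the debug buffer, and the positional argument order passed to `compare_results` (reference outputs first, runtime outputs second) is an assumption rather than a documented contract.

```python
import torch

from executorch.devtools import Inspector
from executorch.devtools.inspector import compare_results

# Placeholder artifact paths produced by an earlier export and on-device run.
# The Inspector gives access to the debug outputs referenced in the prose above.
inspector = Inspector(etdump_path="etdump.etdp", etrecord="etrecord.bin")

# Reference outputs, e.g. recorded from the eager PyTorch model ahead of time.
reference_outputs = [torch.ones(1, 10)]

# Outputs extracted from the runtime; a stand-in tensor keeps the sketch
# self-contained instead of reading them back from the debug buffer.
runtime_outputs = [torch.ones(1, 10) * 0.99]

# Assumed argument order: reference outputs first, runtime outputs second.
compare_results(reference_outputs, runtime_outputs)
```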
diff --git a/docs/source/sdk-delegate-integration.md b/docs/source/sdk-delegate-integration.md index 80033711552..a2f67157c89 100644 --- a/docs/source/sdk-delegate-integration.md +++ b/docs/source/sdk-delegate-integration.md @@ -1,4 +1,4 @@ -# SDK Delegate Integration +# Developer Tools Delegate Integration [Delegate backends](compiler-delegate-and-partitioner.md) are a prominent component of on-device models due to their flexibility in defining behavior. A side effect of this flexibility is that it operates as an opaque transformation. This obfuscates rich associations and mutations that are valuable in post-processing. - For example, if two different operator fusions were to occur within a delegate, post processing wouldn’t be able to separate the two transformations. diff --git a/docs/source/sdk-etdump.md b/docs/source/sdk-etdump.md index 4eacb18b14c..c58efb40de7 100644 --- a/docs/source/sdk-etdump.md +++ b/docs/source/sdk-etdump.md @@ -1,6 +1,6 @@ # Prerequisite | ETDump - ExecuTorch Dump -ETDump (ExecuTorch Dump) is one of the core components of the ExecuTorch SDK experience. It is the mechanism through which all forms of profiling and debugging data is extracted from the runtime. Users can't parse ETDump directly; instead, they should pass it into the Inspector API, which deserializes the data, offering interfaces for flexible analysis and debugging. +ETDump (ExecuTorch Dump) is one of the core components of the ExecuTorch Developer Tools. It is the mechanism through which all forms of profiling and debugging data is extracted from the runtime. Users can't parse ETDump directly; instead, they should pass it into the Inspector API, which deserializes the data, offering interfaces for flexible analysis and debugging. ## Generating an ETDump @@ -9,7 +9,7 @@ Generating an ETDump is a relatively straightforward process. Users can follow t 1. ***Include*** the ETDump header in your code. ```C++ -#include +#include ``` 2. ***Create*** an Instance of the ETDumpGen class and pass it into the `load_method` call that is invoked in the runtime. diff --git a/docs/source/sdk-etrecord.rst b/docs/source/sdk-etrecord.rst index 43ed5095c64..63546f43ca6 100644 --- a/docs/source/sdk-etrecord.rst +++ b/docs/source/sdk-etrecord.rst @@ -9,7 +9,7 @@ users ahead of time (when they export their model to run on ExecuTorch). To draw a rough equivalent to conventional software development, ``ETRecord`` can be considered as the binary built with debug symbols that is used for debugging in GNU Debugger (gdb). It is expected that -the user will supply this to the ExecuTorch SDK tooling in order for +the user will supply this to the ExecuTorch Developer Tools in order for them to debug and visualize their model. ``ETRecord`` contains numerous components such as: @@ -31,7 +31,7 @@ they are interested in working with via our tooling. .. warning:: Users should do a deepcopy of the output of ``to_edge()`` and pass in the deepcopy to the ``generate_etrecord`` API. This is needed because the subsequent call, ``to_executorch()``, does an in-place mutation and will lose debug data in the process. -.. currentmodule:: executorch.sdk.etrecord._etrecord +.. currentmodule:: executorch.devtools.etrecord._etrecord .. 
autofunction:: generate_etrecord Using an ``ETRecord`` diff --git a/docs/source/sdk-inspector.rst b/docs/source/sdk-inspector.rst index e15c1f2a395..4f55271b3fe 100644 --- a/docs/source/sdk-inspector.rst +++ b/docs/source/sdk-inspector.rst @@ -17,7 +17,7 @@ APIs: * By accessing the `public attributes <#inspector-attributes>`__ of the ``Inspector``, ``EventBlock``, and ``Event`` classes. * By using a `CLI <#cli>`__ tool for basic functionalities. -Please refer to the `e2e use case doc `__ get an understanding of how to use these in a real world example. +Please refer to the `e2e use case doc `__ get an understanding of how to use these in a real world example. Inspector Methods @@ -26,26 +26,26 @@ Inspector Methods Constructor ~~~~~~~~~~~ -.. autofunction:: executorch.sdk.Inspector.__init__ +.. autofunction:: executorch.devtools.Inspector.__init__ **Example Usage:** .. code:: python - from executorch.sdk import Inspector + from executorch.devtools import Inspector inspector = Inspector(etdump_path="/path/to/etdump.etdp", etrecord="/path/to/etrecord.bin") to_dataframe ~~~~~~~~~~~~~~~~ -.. autofunction:: executorch.sdk.Inspector.to_dataframe +.. autofunction:: executorch.devtools.Inspector.to_dataframe print_data_tabular ~~~~~~~~~~~~~~~~~~ -.. autofunction:: executorch.sdk.Inspector.print_data_tabular +.. autofunction:: executorch.devtools.Inspector.print_data_tabular .. _example-usage-1: @@ -62,7 +62,7 @@ Note that the unit of delegate profiling events is "cycles". We're working on pr find_total_for_module ~~~~~~~~~~~~~~~~~~~~~ -.. autofunction:: executorch.sdk.Inspector.find_total_for_module +.. autofunction:: executorch.devtools.Inspector.find_total_for_module .. _example-usage-2: @@ -80,7 +80,7 @@ find_total_for_module get_exported_program ~~~~~~~~~~~~~~~~~~~~ -.. autofunction:: executorch.sdk.Inspector.get_exported_program +.. autofunction:: executorch.devtools.Inspector.get_exported_program .. _example-usage-3: @@ -119,7 +119,7 @@ of an ``Inspector`` instance, for example: inspector.event_blocks -.. autoclass:: executorch.sdk.inspector.EventBlock +.. autoclass:: executorch.devtools.inspector.EventBlock ``Event`` Class ~~~~~~~~~~~~~~~ @@ -127,7 +127,7 @@ of an ``Inspector`` instance, for example: Access ``Event`` instances through the ``events`` attribute of an ``EventBlock`` instance. -.. autoclass:: executorch.sdk.inspector.Event +.. autoclass:: executorch.devtools.inspector.Event **Example Usage:** @@ -152,7 +152,7 @@ table. This command produces the identical table output as calling the .. code:: bash - python3 -m sdk.inspector.inspector_cli --etdump_path --etrecord_path + python3 -m devtools.inspector.inspector_cli --etdump_path --etrecord_path Note that the `etrecord_path` argument is optional. diff --git a/docs/source/sdk-overview.md b/docs/source/sdk-overview.md index 53f7d88613a..1e8f1fae1ba 100644 --- a/docs/source/sdk-overview.md +++ b/docs/source/sdk-overview.md @@ -1,44 +1,3 @@ -# Introduction to the ExecuTorch SDK +# Introduction to the ExecuTorch Developer Tools -ExecuTorch has been designed with [productivity](./intro-overview.md) as one of its core objectives and the ExecuTorch SDK enables this through the comprehensive suite of tools it provides users to help them profile, debug, and visualize models that they have onboarded onto ExecuTorch. - -All the components of the SDK have been designed from the ground up with deep integration in both the export process and the runtime. 
This enables us to provide unique features such as linking back operator execution in the runtime to the line of code in the original eager model that this operator originated from. - -## SDK Features - -The ExecuTorch SDK supports the following features: - -- **BundledProgram** is a utility tool for exporting the model bundled with a sample set of (representative) inputs and expected outputs, so that during runtime users can validate that the actual output is in fact the same as the expected output. -- **Profiling** models with operator level breakdown of performance stats - - Linking back operator performance stats to source code and module hierarchy - - Model loading and execution time -- **Delegate Integration** - Surfacing performance details from delegate backends - - Link back delegate operator execution to the nodes they represent in the edge dialect graph (and subsequently linking back to source code and module hierarchy) -- **Debugging** - Intermediate outputs and output quality analysis -- **Visualization** - Coming soon - -## Fundamental components of the SDK - -In order to fully understand and leverage the power of the SDK in this section, the fundamental components that power the SDK will be detailed. - -### ETRecord -ETRecord (ExecuTorch Record) is an artifact generated during the export process that stores the graphs and other metadata that is critical for the SDK tooling to be able to link back the performance/debug data sourced from the runtime to the source code of the eager model. - -To draw a rough equivalence to conventional software development ETRecord can be considered as the binary built with debug symbols that is used for debugging in GNU Project debugger (gdb). - -More details are available in the [ETRecord documentation](sdk-etrecord.rst) on how to generate and store an ETRecord. - -### ETDump -ETDump (ExecuTorch Dump) is the binary blob that is generated by the runtime after running a model. Similarly as above, to draw a rough equivalence to conventional software development, ETDump can be considered as the coredump of ExecuTorch, but in this case within ETDump we store all the performance and debug data that was generated by the runtime during model execution. - -```{note} -If you only care about looking at the raw performance data without linking back to source code and other extensive features, an ETDump alone will be enough to leverage the basic features of the SDK. For the full experience, it is recommended that the users also generate an ETRecord. -``` - -More details are available in the [ETDump documentation](sdk-etdump.md) on how to generate and store an ETDump from the runtime. - - -### Inspector APIs -The Inspector Python APIs are the main user enrty point into the SDK. They join the data sourced from ETDump and ETRecord to give users access to all the performance and debug data sourced from the runtime along with linkage back to eager model source code and module hierarchy in an easy to use API. - -More details are available in the [Inspector API documentation](sdk-inspector.rst) on how to use the Inspector APIs. +Please update your link to . This URL will be deleted after v0.4.0. diff --git a/docs/source/sdk-profiling.md b/docs/source/sdk-profiling.md index 83276d8d180..e17fb1ae48e 100644 --- a/docs/source/sdk-profiling.md +++ b/docs/source/sdk-profiling.md @@ -4,7 +4,7 @@ Profiling in ExecuTorch gives users access to these runtime metrics: - Model Load Time. - Operator Level Execution Time. - Delegate Execution Time. 
- - If the delegate that the user is calling into has been integrated with the [SDK](./sdk-delegate-integration.md), then users will also be able to access delegated operator execution time. + - If the delegate that the user is calling into has been integrated with the [Developer Tools](./sdk-delegate-integration.md), then users will also be able to access delegated operator execution time. - End-to-end Inference Execution Time. One uniqe aspect of ExecuTorch Profiling is the ability to link every runtime executed operator back to the exact line of python code from which this operator originated. This capability enables users to easily identify hotspots in their model, source them back to the exact line of Python code, and optimize if chosen to. @@ -20,4 +20,4 @@ We provide access to all the profiling data via the Python [Inspector API](./sdk - Through the Inspector API, users can do a wide range of analysis varying from printing out performance details to doing more finer granular calculation on module level. -Please refer to the [SDK tutorial](./tutorials/sdk-integration-tutorial.rst) for a step-by-step walkthrough of the above process on a sample model. +Please refer to the [Developer Tools tutorial](./tutorials/devtools-integration-tutorial.rst) for a step-by-step walkthrough of the above process on a sample model. diff --git a/docs/source/sdk-tutorial.md b/docs/source/sdk-tutorial.md index 90c9ed6d343..457d3b47ebf 100644 --- a/docs/source/sdk-tutorial.md +++ b/docs/source/sdk-tutorial.md @@ -1,3 +1,3 @@ -## SDK usage tutorial +## Developer Tools Usage Tutorial -Please refer to the [SDK tutorial](./tutorials/sdk-integration-tutorial) for a walkthrough on how to profile a model in ExecuTorch using the SDK. +Please update your link to . This URL will be deleted after v0.4.0. diff --git a/docs/source/tutorial-xnnpack-delegate-lowering.md b/docs/source/tutorial-xnnpack-delegate-lowering.md index 4491a6e8c80..666ee23aa35 100644 --- a/docs/source/tutorial-xnnpack-delegate-lowering.md +++ b/docs/source/tutorial-xnnpack-delegate-lowering.md @@ -74,13 +74,13 @@ After lowering to the XNNPACK Program, we can then prepare it for executorch and The XNNPACK delegate can also execute symmetrically quantized models. To understand the quantization flow and learn how to quantize models, refer to [Custom Quantization](quantization-custom-quantization.md) note. For the sake of this tutorial, we will leverage the `quantize()` python helper function conveniently added to the `executorch/executorch/examples` folder. ```python -from torch._export import capture_pre_autograd_graph +from torch.export import export_for_training from executorch.exir import EdgeCompileConfig mobilenet_v2 = models.mobilenetv2.mobilenet_v2(weights=MobileNet_V2_Weights.DEFAULT).eval() sample_inputs = (torch.randn(1, 3, 224, 224), ) -mobilenet_v2 = capture_pre_autograd_graph(mobilenet_v2, sample_inputs) # 2-stage export for quantization path +mobilenet_v2 = export_for_training(mobilenet_v2, sample_inputs).module() # 2-stage export for quantization path from torch.ao.quantization.quantize_pt2e import convert_pt2e, prepare_pt2e from torch.ao.quantization.quantizer.xnnpack_quantizer import ( @@ -107,7 +107,7 @@ def quantize(model, example_inputs): quantized_mobilenetv2 = quantize(mobilenet_v2, sample_inputs) ``` -Quantization requires a two stage export. First we use the `capture_pre_autograd_graph` API to capture the model before giving it to `quantize` utility function. 
After performing the quantization step, we can now leverage the XNNPACK delegate to lower the quantized exported model graph. From here, the procedure is the same as for the non-quantized model lowering to XNNPACK. +Quantization requires a two stage export. First we use the `export_for_training` API to capture the model before giving it to `quantize` utility function. After performing the quantization step, we can now leverage the XNNPACK delegate to lower the quantized exported model graph. From here, the procedure is the same as for the non-quantized model lowering to XNNPACK. ```python # Continued from earlier... @@ -149,9 +149,10 @@ mkdir cmake-out cmake \ -DCMAKE_INSTALL_PREFIX=cmake-out \ -DCMAKE_BUILD_TYPE=Release \ + -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ -DEXECUTORCH_BUILD_XNNPACK=ON \ - -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ -DEXECUTORCH_ENABLE_LOGGING=ON \ -DPYTHON_EXECUTABLE=python \ -Bcmake-out . diff --git a/docs/source/tutorials_source/devtools-integration-tutorial.py b/docs/source/tutorials_source/devtools-integration-tutorial.py new file mode 100644 index 00000000000..b5e335b43d1 --- /dev/null +++ b/docs/source/tutorials_source/devtools-integration-tutorial.py @@ -0,0 +1,301 @@ +# -*- coding: utf-8 -*- +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +""" +Using the ExecuTorch Developer Tools to Profile a Model +======================== + +**Author:** `Jack Khuu `__ +""" + +###################################################################### +# The `ExecuTorch Developer Tools <../devtools-overview.html>`__ is a set of tools designed to +# provide users with the ability to profile, debug, and visualize ExecuTorch +# models. +# +# This tutorial will show a full end-to-end flow of how to utilize the Developer Tools to profile a model. +# Specifically, it will: +# +# 1. Generate the artifacts consumed by the Developer Tools (`ETRecord <../sdk-etrecord.html>`__, `ETDump <../sdk-etdump.html>`__). +# 2. Create an Inspector class consuming these artifacts. +# 3. Utilize the Inspector class to analyze the model profiling result. + +###################################################################### +# Prerequisites +# ------------- +# +# To run this tutorial, you’ll first need to +# `Set up your ExecuTorch environment <../getting-started-setup.html>`__. +# + +###################################################################### +# Generate ETRecord (Optional) +# ---------------------------- +# +# The first step is to generate an ``ETRecord``. ``ETRecord`` contains model +# graphs and metadata for linking runtime results (such as profiling) to +# the eager model. This is generated via ``executorch.devtools.generate_etrecord``. +# +# ``executorch.devtools.generate_etrecord`` takes in an output file path (str), the +# edge dialect model (``EdgeProgramManager``), the ExecuTorch dialect model +# (``ExecutorchProgramManager``), and an optional dictionary of additional models. +# +# In this tutorial, an example model (shown below) is used to demonstrate. 
+ +import copy + +import torch +import torch.nn as nn +import torch.nn.functional as F +from executorch.devtools import generate_etrecord + +from executorch.exir import ( + EdgeCompileConfig, + EdgeProgramManager, + ExecutorchProgramManager, + to_edge, +) +from torch.export import export, ExportedProgram + + +# Generate Model +class Net(nn.Module): + def __init__(self): + super(Net, self).__init__() + # 1 input image channel, 6 output channels, 5x5 square convolution + # kernel + self.conv1 = nn.Conv2d(1, 6, 5) + self.conv2 = nn.Conv2d(6, 16, 5) + # an affine operation: y = Wx + b + self.fc1 = nn.Linear(16 * 5 * 5, 120) # 5*5 from image dimension + self.fc2 = nn.Linear(120, 84) + self.fc3 = nn.Linear(84, 10) + + def forward(self, x): + # Max pooling over a (2, 2) window + x = F.max_pool2d(F.relu(self.conv1(x)), (2, 2)) + # If the size is a square, you can specify with a single number + x = F.max_pool2d(F.relu(self.conv2(x)), 2) + x = torch.flatten(x, 1) # flatten all dimensions except the batch dimension + x = F.relu(self.fc1(x)) + x = F.relu(self.fc2(x)) + x = self.fc3(x) + return x + + +model = Net() + +aten_model: ExportedProgram = export( + model, + (torch.randn(1, 1, 32, 32),), +) + +edge_program_manager: EdgeProgramManager = to_edge( + aten_model, compile_config=EdgeCompileConfig(_check_ir_validity=True) +) +edge_program_manager_copy = copy.deepcopy(edge_program_manager) +et_program_manager: ExecutorchProgramManager = edge_program_manager.to_executorch() + + +# Generate ETRecord +etrecord_path = "etrecord.bin" +generate_etrecord(etrecord_path, edge_program_manager_copy, et_program_manager) + +# sphinx_gallery_start_ignore +from unittest.mock import patch + +# sphinx_gallery_end_ignore + +###################################################################### +# +# .. warning:: +# Users should do a deepcopy of the output of ``to_edge()`` and pass in the +# deepcopy to the ``generate_etrecord`` API. This is needed because the +# subsequent call, ``to_executorch()``, does an in-place mutation and will +# lose debug data in the process. +# + +###################################################################### +# Generate ETDump +# --------------- +# +# Next step is to generate an ``ETDump``. ``ETDump`` contains runtime results +# from executing a `Bundled Program Model <../sdk-bundled-io.html>`__. +# +# In this tutorial, a `Bundled Program` is created from the example model above. + +import torch +from executorch.devtools import BundledProgram + +from executorch.devtools.bundled_program.config import MethodTestCase, MethodTestSuite +from executorch.devtools.bundled_program.serialize import ( + serialize_from_bundled_program_to_flatbuffer, +) + +from executorch.exir import to_edge +from torch.export import export + +# Step 1: ExecuTorch Program Export +m_name = "forward" +method_graphs = {m_name: export(model, (torch.randn(1, 1, 32, 32),))} + +# Step 2: Construct Method Test Suites +inputs = [[torch.randn(1, 1, 32, 32)] for _ in range(2)] + +method_test_suites = [ + MethodTestSuite( + method_name=m_name, + test_cases=[ + MethodTestCase(inputs=inp, expected_outputs=getattr(model, m_name)(*inp)) + for inp in inputs + ], + ) +] + +# Step 3: Generate BundledProgram +executorch_program = to_edge(method_graphs).to_executorch() +bundled_program = BundledProgram(executorch_program, method_test_suites) + +# Step 4: Serialize BundledProgram to flatbuffer. 
+serialized_bundled_program = serialize_from_bundled_program_to_flatbuffer( + bundled_program +) +save_path = "bundled_program.bp" +with open(save_path, "wb") as f: + f.write(serialized_bundled_program) + +###################################################################### +# Use CMake (follow `these instructions <../runtime-build-and-cross-compilation.html#configure-the-cmake-build>`__ to set up cmake) to execute the Bundled Program to generate the ``ETDump``:: +# +# cd executorch +# ./examples/devtools/build_example_runner.sh +# cmake-out/examples/devtools/example_runner --bundled_program_path="bundled_program.bp" + +###################################################################### +# Creating an Inspector +# --------------------- +# +# Final step is to create the ``Inspector`` by passing in the artifact paths. +# Inspector takes the runtime results from ``ETDump`` and correlates them to +# the operators of the Edge Dialect Graph. +# +# Recall: An ``ETRecord`` is not required. If an ``ETRecord`` is not provided, +# the Inspector will show runtime results without operator correlation. +# +# To visualize all runtime events, call Inspector's ``print_data_tabular``. + +from executorch.devtools import Inspector + +# sphinx_gallery_start_ignore +inspector_patch = patch.object(Inspector, "__init__", return_value=None) +inspector_patch_print = patch.object(Inspector, "print_data_tabular", return_value="") +inspector_patch.start() +inspector_patch_print.start() +# sphinx_gallery_end_ignore +etrecord_path = "etrecord.bin" +etdump_path = "etdump.etdp" +inspector = Inspector(etdump_path=etdump_path, etrecord=etrecord_path) +# sphinx_gallery_start_ignore +inspector.event_blocks = [] +# sphinx_gallery_end_ignore +inspector.print_data_tabular() + +# sphinx_gallery_start_ignore +inspector_patch.stop() +inspector_patch_print.stop() +# sphinx_gallery_end_ignore + +###################################################################### +# Analyzing with an Inspector +# --------------------------- +# +# ``Inspector`` provides 2 ways of accessing ingested information: `EventBlocks <../sdk-inspector#eventblock-class>`__ +# and ``DataFrames``. These mediums give users the ability to perform custom +# analysis about their model performance. +# +# Below are examples usages, with both ``EventBlock`` and ``DataFrame`` approaches. + +# Set Up +import pprint as pp + +import pandas as pd + +pd.set_option("display.max_colwidth", None) +pd.set_option("display.max_columns", None) + +###################################################################### +# If a user wants the raw profiling results, they would do something similar to +# finding the raw runtime data of an ``addmm.out`` event. + +for event_block in inspector.event_blocks: + # Via EventBlocks + for event in event_block.events: + if event.name == "native_call_addmm.out": + print(event.name, event.perf_data.raw) + + # Via Dataframe + df = event_block.to_dataframe() + df = df[df.event_name == "native_call_addmm.out"] + print(df[["event_name", "raw"]]) + print() + +###################################################################### +# If a user wants to trace an operator back to their model code, they would do +# something similar to finding the module hierarchy and stack trace of the +# slowest ``convolution.out`` call. 
+ +for event_block in inspector.event_blocks: + # Via EventBlocks + slowest = None + for event in event_block.events: + if event.name == "native_call_convolution.out": + if slowest is None or event.perf_data.p50 > slowest.perf_data.p50: + slowest = event + if slowest is not None: + print(slowest.name) + print() + pp.pprint(slowest.stack_traces) + print() + pp.pprint(slowest.module_hierarchy) + + # Via Dataframe + df = event_block.to_dataframe() + df = df[df.event_name == "native_call_convolution.out"] + if len(df) > 0: + slowest = df.loc[df["p50"].idxmax()] + print(slowest.event_name) + print() + pp.pprint(slowest.stack_traces) + print() + pp.pprint(slowest.module_hierarchy) + +###################################################################### +# If a user wants the total runtime of a module, they can use +# ``find_total_for_module``. + +print(inspector.find_total_for_module("L__self__")) +print(inspector.find_total_for_module("L__self___conv2")) + +###################################################################### +# Note: ``find_total_for_module`` is a special first class method of +# `Inspector <../sdk-inspector.html>`__ + +###################################################################### +# Conclusion +# ---------- +# +# In this tutorial, we learned about the steps required to consume an ExecuTorch +# model with the ExecuTorch Developer Tools. It also showed how to use the Inspector APIs +# to analyze the model run results. +# +# Links Mentioned +# ^^^^^^^^^^^^^^^ +# +# - `ExecuTorch Developer Tools Overview <../devtools-overview.html>`__ +# - `ETRecord <../sdk-etrecord.html>`__ +# - `ETDump <../sdk-etdump.html>`__ +# - `Inspector <../sdk-inspector.html>`__ diff --git a/docs/source/tutorials_source/export-to-executorch-tutorial.py b/docs/source/tutorials_source/export-to-executorch-tutorial.py index 2071567ddd1..fac3eab08e5 100644 --- a/docs/source/tutorials_source/export-to-executorch-tutorial.py +++ b/docs/source/tutorials_source/export-to-executorch-tutorial.py @@ -179,8 +179,8 @@ def forward(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor: # ----------------------- # # To quantize a model, we first need to capture the graph with -# ``torch._export.capture_pre_autograd_graph``, perform quantization, and then -# call ``torch.export``. ``torch._export.capture_pre_autograd_graph`` returns a +# ``torch.export.export_for_training``, perform quantization, and then +# call ``torch.export``. ``torch.export.export_for_training`` returns a # graph which contains ATen operators which are Autograd safe, meaning they are # safe for eager-mode training, which is needed for quantization. We will call # the graph at this level, the ``Pre-Autograd ATen Dialect`` graph. @@ -193,10 +193,10 @@ def forward(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor: # will annotate the nodes in the graph with information needed to quantize the # model properly for a specific backend. 
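Since the comments above describe the two-stage quantization flow only in prose, here is a condensed sketch of how the pieces fit together end to end, using the XNNPACK quantizer shown elsewhere in these docs. The toy module, tensor shapes, and single calibration pass are illustrative placeholders, not part of the tutorial itself.

```python
import torch
from torch.ao.quantization.quantize_pt2e import convert_pt2e, prepare_pt2e
from torch.ao.quantization.quantizer.xnnpack_quantizer import (
    XNNPACKQuantizer,
    get_symmetric_quantization_config,
)
from torch.export import export, export_for_training


class TinyLinear(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.linear = torch.nn.Linear(8, 4)

    def forward(self, x):
        return self.linear(x)


model = TinyLinear().eval()
example_args = (torch.randn(1, 8),)

# Stage 1: pre-autograd capture (Pre-Autograd ATen Dialect graph).
captured = export_for_training(model, example_args).module()

# Annotate the graph with a backend quantizer, calibrate, then convert.
quantizer = XNNPACKQuantizer()
quantizer.set_global(get_symmetric_quantization_config())
prepared = prepare_pt2e(captured, quantizer)
prepared(*example_args)  # one representative calibration pass
quantized = convert_pt2e(prepared)

# Stage 2: export the quantized graph to the ATen Dialect for lowering.
aten_dialect = export(quantized, example_args)
```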
-from torch._export import capture_pre_autograd_graph +from torch.export import export_for_training example_args = (torch.randn(1, 3, 256, 256),) -pre_autograd_aten_dialect = capture_pre_autograd_graph(SimpleConv(), example_args) +pre_autograd_aten_dialect = export_for_training(SimpleConv(), example_args).module() print("Pre-Autograd ATen Dialect Graph") print(pre_autograd_aten_dialect) @@ -523,9 +523,7 @@ def forward(self, a, x, b): executorch_program: ExecutorchProgramManager = edge_program.to_executorch( ExecutorchBackendConfig( passes=[], # User-defined passes - memory_planning_pass=MemoryPlanningPass( - "greedy" - ), # Default memory planning pass + memory_planning_pass=MemoryPlanningPass(), # Default memory planning pass ) ) @@ -562,8 +560,7 @@ def forward(self, a, x, b): # Here is an example for an entire end-to-end workflow: import torch -from torch._export import capture_pre_autograd_graph -from torch.export import export, ExportedProgram +from torch.export import export, export_for_training, ExportedProgram class M(torch.nn.Module): @@ -577,7 +574,7 @@ def forward(self, x): example_args = (torch.randn(3, 4),) -pre_autograd_aten_dialect = capture_pre_autograd_graph(M(), example_args) +pre_autograd_aten_dialect = export_for_training(M(), example_args).module() # Optionally do quantization: # pre_autograd_aten_dialect = convert_pt2e(prepare_pt2e(pre_autograd_aten_dialect, CustomBackendQuantizer)) aten_dialect: ExportedProgram = export(pre_autograd_aten_dialect, example_args) diff --git a/docs/source/tutorials_source/sdk-integration-tutorial.py b/docs/source/tutorials_source/sdk-integration-tutorial.py index ccc2e480ad0..b9a8009c646 100644 --- a/docs/source/tutorials_source/sdk-integration-tutorial.py +++ b/docs/source/tutorials_source/sdk-integration-tutorial.py @@ -6,295 +6,8 @@ # LICENSE file in the root directory of this source tree. """ -Using the ExecuTorch SDK to Profile a Model +Using the ExecuTorch Developer Tools to Profile a Model ======================== -**Author:** `Jack Khuu `__ +Please update your link to . This URL will be deleted after v0.4.0. """ - -###################################################################### -# The `ExecuTorch SDK <../sdk-overview.html>`__ is a set of tools designed to -# provide users with the ability to profile, debug, and visualize ExecuTorch -# models. -# -# This tutorial will show a full end-to-end flow of how to utilize the SDK. -# Specifically, it will: -# -# 1. Generate the artifacts consumed by the SDK (`ETRecord <../sdk-etrecord.html>`__, `ETDump <../sdk-etdump.html>`__). -# 2. Create an Inspector class consuming these artifacts. -# 3. Utilize the Inspector class to analyze the model. - -###################################################################### -# Prerequisites -# ------------- -# -# To run this tutorial, you’ll first need to -# `Set up your ExecuTorch environment <../getting-started-setup.html>`__. -# - -###################################################################### -# Generate ETRecord (Optional) -# ---------------------------- -# -# The first step is to generate an ``ETRecord``. ``ETRecord`` contains model -# graphs and metadata for linking runtime results (such as profiling) to -# the eager model. This is generated via ``executorch.sdk.generate_etrecord``. -# -# ``executorch.sdk.generate_etrecord`` takes in an output file path (str), the -# edge dialect model (``EdgeProgramManager``), the ExecuTorch dialect model -# (``ExecutorchProgramManager``), and an optional dictionary of additional models. 
-# -# In this tutorial, an example model (shown below) is used to demonstrate. - -import copy - -import torch -import torch.nn as nn -import torch.nn.functional as F - -from executorch.exir import ( - EdgeCompileConfig, - EdgeProgramManager, - ExecutorchProgramManager, - to_edge, -) -from executorch.sdk import generate_etrecord -from torch.export import export, ExportedProgram - - -# Generate Model -class Net(nn.Module): - def __init__(self): - super(Net, self).__init__() - # 1 input image channel, 6 output channels, 5x5 square convolution - # kernel - self.conv1 = nn.Conv2d(1, 6, 5) - self.conv2 = nn.Conv2d(6, 16, 5) - # an affine operation: y = Wx + b - self.fc1 = nn.Linear(16 * 5 * 5, 120) # 5*5 from image dimension - self.fc2 = nn.Linear(120, 84) - self.fc3 = nn.Linear(84, 10) - - def forward(self, x): - # Max pooling over a (2, 2) window - x = F.max_pool2d(F.relu(self.conv1(x)), (2, 2)) - # If the size is a square, you can specify with a single number - x = F.max_pool2d(F.relu(self.conv2(x)), 2) - x = torch.flatten(x, 1) # flatten all dimensions except the batch dimension - x = F.relu(self.fc1(x)) - x = F.relu(self.fc2(x)) - x = self.fc3(x) - return x - - -model = Net() - -aten_model: ExportedProgram = export( - model, - (torch.randn(1, 1, 32, 32),), -) - -edge_program_manager: EdgeProgramManager = to_edge( - aten_model, compile_config=EdgeCompileConfig(_check_ir_validity=True) -) -edge_program_manager_copy = copy.deepcopy(edge_program_manager) -et_program_manager: ExecutorchProgramManager = edge_program_manager.to_executorch() - - -# Generate ETRecord -etrecord_path = "etrecord.bin" -generate_etrecord(etrecord_path, edge_program_manager_copy, et_program_manager) - -# sphinx_gallery_start_ignore -from unittest.mock import patch - -# sphinx_gallery_end_ignore - -###################################################################### -# -# .. warning:: -# Users should do a deepcopy of the output of ``to_edge()`` and pass in the -# deepcopy to the ``generate_etrecord`` API. This is needed because the -# subsequent call, ``to_executorch()``, does an in-place mutation and will -# lose debug data in the process. -# - -###################################################################### -# Generate ETDump -# --------------- -# -# Next step is to generate an ``ETDump``. ``ETDump`` contains runtime results -# from executing a `Bundled Program Model <../sdk-bundled-io.html>`__. -# -# In this tutorial, a `Bundled Program` is created from the example model above. - -import torch - -from executorch.exir import to_edge -from executorch.sdk import BundledProgram - -from executorch.sdk.bundled_program.config import MethodTestCase, MethodTestSuite -from executorch.sdk.bundled_program.serialize import ( - serialize_from_bundled_program_to_flatbuffer, -) -from torch.export import export - -# Step 1: ExecuTorch Program Export -m_name = "forward" -method_graphs = {m_name: export(model, (torch.randn(1, 1, 32, 32),))} - -# Step 2: Construct Method Test Suites -inputs = [[torch.randn(1, 1, 32, 32)] for _ in range(2)] - -method_test_suites = [ - MethodTestSuite( - method_name=m_name, - test_cases=[ - MethodTestCase(inputs=inp, expected_outputs=getattr(model, m_name)(*inp)) - for inp in inputs - ], - ) -] - -# Step 3: Generate BundledProgram -executorch_program = to_edge(method_graphs).to_executorch() -bundled_program = BundledProgram(executorch_program, method_test_suites) - -# Step 4: Serialize BundledProgram to flatbuffer. 
-serialized_bundled_program = serialize_from_bundled_program_to_flatbuffer( - bundled_program -) -save_path = "bundled_program.bp" -with open(save_path, "wb") as f: - f.write(serialized_bundled_program) - -###################################################################### -# Use CMake (follow `these instructions <../runtime-build-and-cross-compilation.html#configure-the-cmake-build>`__ to set up cmake) to execute the Bundled Program to generate the ``ETDump``:: -# -# cd executorch -# ./examples/sdk/build_sdk_example_runner.sh -# cmake-out/examples/sdk/sdk_example_runner --bundled_program_path="bundled_program.bp" - -###################################################################### -# Creating an Inspector -# --------------------- -# -# Final step is to create the ``Inspector`` by passing in the artifact paths. -# Inspector takes the runtime results from ``ETDump`` and correlates them to -# the operators of the Edge Dialect Graph. -# -# Recall: An ``ETRecord`` is not required. If an ``ETRecord`` is not provided, -# the Inspector will show runtime results without operator correlation. -# -# To visualize all runtime events, call Inspector's ``print_data_tabular``. - -from executorch.sdk import Inspector - -# sphinx_gallery_start_ignore -inspector_patch = patch.object(Inspector, "__init__", return_value=None) -inspector_patch_print = patch.object(Inspector, "print_data_tabular", return_value="") -inspector_patch.start() -inspector_patch_print.start() -# sphinx_gallery_end_ignore -etdump_path = "etdump.etdp" -inspector = Inspector(etdump_path=etdump_path, etrecord=etrecord_path) -# sphinx_gallery_start_ignore -inspector.event_blocks = [] -# sphinx_gallery_end_ignore -inspector.print_data_tabular() - -# sphinx_gallery_start_ignore -inspector_patch.stop() -inspector_patch_print.stop() -# sphinx_gallery_end_ignore - -###################################################################### -# Analyzing with an Inspector -# --------------------------- -# -# ``Inspector`` provides 2 ways of accessing ingested information: `EventBlocks <../sdk-inspector#eventblock-class>`__ -# and ``DataFrames``. These mediums give users the ability to perform custom -# analysis about their model performance. -# -# Below are examples usages, with both ``EventBlock`` and ``DataFrame`` approaches. - -# Set Up -import pprint as pp - -import pandas as pd - -pd.set_option("display.max_colwidth", None) -pd.set_option("display.max_columns", None) - -###################################################################### -# If a user wants the raw profiling results, they would do something similar to -# finding the raw runtime data of an ``addmm.out`` event. - -for event_block in inspector.event_blocks: - # Via EventBlocks - for event in event_block.events: - if event.name == "native_call_addmm.out": - print(event.name, event.perf_data.raw) - - # Via Dataframe - df = event_block.to_dataframe() - df = df[df.event_name == "native_call_addmm.out"] - print(df[["event_name", "raw"]]) - print() - -###################################################################### -# If a user wants to trace an operator back to their model code, they would do -# something similar to finding the module hierarchy and stack trace of the -# slowest ``convolution.out`` call. 
- -for event_block in inspector.event_blocks: - # Via EventBlocks - slowest = None - for event in event_block.events: - if event.name == "native_call_convolution.out": - if slowest is None or event.perf_data.p50 > slowest.perf_data.p50: - slowest = event - if slowest is not None: - print(slowest.name) - print() - pp.pprint(slowest.stack_traces) - print() - pp.pprint(slowest.module_hierarchy) - - # Via Dataframe - df = event_block.to_dataframe() - df = df[df.event_name == "native_call_convolution.out"] - if len(df) > 0: - slowest = df.loc[df["p50"].idxmax()] - print(slowest.event_name) - print() - pp.pprint(slowest.stack_traces) - print() - pp.pprint(slowest.module_hierarchy) - -###################################################################### -# If a user wants the total runtime of a module, they can use -# ``find_total_for_module``. - -print(inspector.find_total_for_module("L__self__")) -print(inspector.find_total_for_module("L__self___conv2")) - -###################################################################### -# Note: ``find_total_for_module`` is a special first class method of -# `Inspector <../sdk-inspector.html>`__ - -###################################################################### -# Conclusion -# ---------- -# -# In this tutorial, we learned about the steps required to consume an ExecuTorch -# model with the ExecuTorch SDK. It also showed how to use the Inspector APIs -# to analyze the model run results. -# -# Links Mentioned -# ^^^^^^^^^^^^^^^ -# -# - `ExecuTorch SDK <../sdk-overview.html>`__ -# - `ETRecord <../sdk-etrecord.html>`__ -# - `ETDump <../sdk-etdump.html>`__ -# - `Inspector <../sdk-inspector.html>`__ diff --git a/docs/website/docs/tutorials/bundled_program.md b/docs/website/docs/tutorials/bundled_program.md index ac67d6f6285..e477d8e6a61 100644 --- a/docs/website/docs/tutorials/bundled_program.md +++ b/docs/website/docs/tutorials/bundled_program.md @@ -49,19 +49,15 @@ Error GetProgramData( Here's an example of how to use the GetProgramData API: ```c++ - std::shared_ptr buff_ptr; - size_t buff_len; - -// FILE_PATH here can be either BundledProgram or Program flatbuffer file. - Error status = torch::executor::util::read_file_content( - FILE_PATH, &buff_ptr, &buff_len); - ET_CHECK_MSG( - status == Error::Ok, - "read_file_content() failed with status 0x%" PRIx32, - status); - - uint32_t prof_tok = EXECUTORCH_BEGIN_PROF("de-serialize model"); - + // Assume that the user has read the contents of the file into file_data using + // whatever method works best for their application. The file could contain + // either BundledProgram data or Program data. + void* file_data = ...; + size_t file_data_len = ...; + + // If file_data contains a BundledProgram, GetProgramData() will return a + // pointer to the Program data embedded inside it. Otherwise it will return + // file_data, which already pointed to Program data. const void* program_ptr; size_t program_len; status = torch::executor::bundled_program::GetProgramData( @@ -122,14 +118,13 @@ ET_NODISCARD Error VerifyResultWithBundledExpectedOutput( ### Example -Here we provide an example about how to run the bundled program step by step. Most of the code are borrowed from "fbcode/executorch/sdk/fb/runners/executor_runner.cpp" and please review that file if you need more info and context: +Here we provide an example about how to run the bundled program step by step. ```c++ // method_name is the name for the method we want to test // memory_manager is the executor::MemoryManager variable for executor memory allocation. 
// program is the executorch program. Result method = program->load_method(method_name, &memory_manager); - EXECUTORCH_END_PROF(prof_tok); ET_CHECK_MSG( method.ok(), "load_method() failed with status 0x%" PRIx32, diff --git a/examples/README.md b/examples/README.md index f36e873e843..e3a18cf5a0a 100644 --- a/examples/README.md +++ b/examples/README.md @@ -13,7 +13,7 @@ examples ├── models # Contains a set of popular and representative PyTorch models ├── portable # Contains end-to-end demos for ExecuTorch in portable mode ├── selective_build # Contains demos of selective build for optimizing the binary size of the ExecuTorch runtime -├── sdk # Contains demos of BundledProgram and ETDump +├── devtools # Contains demos of BundledProgram and ETDump ├── demo-apps # Contains demo apps for Android and iOS ├── xnnpack # Contains end-to-end ExecuTorch demos with first-party optimization using XNNPACK ├── apple @@ -35,13 +35,17 @@ A user's journey may commence by exploring the demos located in the [`portable/` [This page](./models/llama2/README.md) demonstrates how to run Llama 2 7B and Llama 3 8B models on mobile via ExecuTorch. We use XNNPACK to accelerate the performance and 4-bit groupwise PTQ quantization to fit the model on Android and iOS mobile phones. +## Demo of Llava1.5 7B + +[This page](./models/llava/README.md) demonstrates how to run [Llava 1.5 7B](https://github.com/haotian-liu/LLaVA) model on mobile via ExecuTorch. We use XNNPACK to accelerate the performance and 4-bit groupwise PTQ quantization to fit the model on Android and iOS mobile phones. + ## Demo of Selective Build To understand how to deploy the ExecuTorch runtime with optimization for binary size, explore the demos available in the [`selective_build/`](./selective_build) directory. These demos are specifically designed to illustrate the [Selective Build](../docs/source/kernel-library-selective_build.md), offering insights into reducing the binary size while maintaining efficiency. -## Demo of ExecuTorch SDK +## Demo of ExecuTorch Developer Tools -You will find demos of [ExecuTorch SDK](./sdk/) in the [`sdk/`](./sdk/) directory. The examples focuses on exporting and executing BundledProgram for ExecuTorch model verification and ETDump for collecting profiling and debug data. +You will find demos of [ExecuTorch Developer Tools](./devtools/) in the [`devtools/`](./devtools/) directory. The examples focuses on exporting and executing BundledProgram for ExecuTorch model verification and ETDump for collecting profiling and debug data. 
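As a quick orientation for that demo, the following is a condensed sketch of the export-side BundledProgram flow those examples exercise, assembled from the APIs documented earlier in this change. The toy module, method name, and output file name are illustrative placeholders; the serialized file would then be handed to a runtime example runner for verification.

```python
import torch
from executorch.devtools import BundledProgram
from executorch.devtools.bundled_program.config import MethodTestCase, MethodTestSuite
from executorch.devtools.bundled_program.serialize import (
    serialize_from_bundled_program_to_flatbuffer,
)
from executorch.exir import to_edge
from torch.export import export


class TinyModel(torch.nn.Module):
    def forward(self, x):
        return torch.nn.functional.relu(x)


model = TinyModel()
inputs = (torch.randn(2, 2),)

# Lower the exported "forward" method to an ExecuTorch program.
executorch_program = to_edge({"forward": export(model, inputs)}).to_executorch()

# Bundle a test case whose expected output comes from the eager model.
method_test_suites = [
    MethodTestSuite(
        method_name="forward",
        test_cases=[MethodTestCase(inputs=inputs, expected_outputs=model(*inputs))],
    )
]
bundled_program = BundledProgram(executorch_program, method_test_suites)

# Serialize to flatbuffer so a runtime example runner can verify the outputs.
with open("bundled_program.bpte", "wb") as f:
    f.write(serialize_from_bundled_program_to_flatbuffer(bundled_program))
```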
## Demo Apps diff --git a/examples/apple/coreml/executor_runner/main.mm b/examples/apple/coreml/executor_runner/main.mm index 4cc21ba30a2..405bfb9c6c4 100644 --- a/examples/apple/coreml/executor_runner/main.mm +++ b/examples/apple/coreml/executor_runner/main.mm @@ -13,8 +13,7 @@ #import #import #import -#import -#import +#import #import #import #import @@ -25,8 +24,25 @@ static inline id check_class(id obj, Class cls) { #define SAFE_CAST(Object, Type) ((Type *)check_class(Object, [Type class])) -using namespace torch::executor; -using torch::executor::util::FileDataLoader; +using executorch::etdump::ETDumpGen; +using executorch::etdump::ETDumpResult; +using executorch::extension::FileDataLoader; +using executorch::runtime::DataLoader; +using executorch::runtime::EValue; +using executorch::runtime::Error; +using executorch::runtime::EventTracer; +using executorch::runtime::EventTracerDebugLogLevel; +using executorch::runtime::FreeableBuffer; +using executorch::runtime::HierarchicalAllocator; +using executorch::runtime::MemoryAllocator; +using executorch::runtime::MemoryManager; +using executorch::runtime::Method; +using executorch::runtime::MethodMeta; +using executorch::runtime::Program; +using executorch::runtime::Result; +using executorch::runtime::Span; +using executorch::runtime::TensorInfo; +using torch::executor::CoreMLBackendDelegate; static constexpr size_t kRuntimeMemorySize = 16 * 1024U * 1024U; // 16 MB @@ -295,7 +311,7 @@ bool is_model_analysis_enabled(const Args& args) { } void dump_etdump_gen(ETDumpGen *etdump_gen, const Buffer& debug_buffer, const Args& args) { - etdump_result result = (etdump_gen != nullptr) ? etdump_gen->get_etdump_data() : etdump_result{.buf = nullptr, .size = 0}; + ETDumpResult result = (etdump_gen != nullptr) ? etdump_gen->get_etdump_data() : ETDumpResult{.buf = nullptr, .size = 0}; if (result.size == 0) { return; } @@ -317,7 +333,7 @@ void dump_etdump_gen(ETDumpGen *etdump_gen, const Buffer& debug_buffer, const Ar int main(int argc, char * argv[]) { @autoreleasepool { - runtime_init(); + executorch::runtime::runtime_init(); auto args = parse_command_line_args([[NSProcessInfo processInfo] arguments]); if (args.purge_models_cache) { diff --git a/examples/apple/coreml/scripts/build_executor_runner.sh b/examples/apple/coreml/scripts/build_executor_runner.sh index 16c5dea02a4..89cd45ea6b1 100755 --- a/examples/apple/coreml/scripts/build_executor_runner.sh +++ b/examples/apple/coreml/scripts/build_executor_runner.sh @@ -36,7 +36,7 @@ cmake "$EXECUTORCH_ROOT_PATH" -B"$CMAKE_BUILD_DIR_PATH" \ -DFLATC_EXECUTABLE="$(which flatc)" \ -DEXECUTORCH_BUILD_EXECUTOR_RUNNER=OFF \ -DEXECUTORCH_BUILD_XNNPACK=OFF \ --DEXECUTORCH_BUILD_SDK=ON \ +-DEXECUTORCH_BUILD_DEVTOOLS=ON \ -DEXECUTORCH_BUILD_COREML=ON \ -Dprotobuf_BUILD_TESTS=OFF \ -Dprotobuf_BUILD_EXAMPLES=OFF \ @@ -56,7 +56,7 @@ mkdir -p "$EXECUTORCH_INCLUDE_DIR_PATH" find extension \( -name "*.h" -o -name "*.hpp" \) -exec rsync -R '{}' "$EXECUTORCH_INCLUDE_DIR_PATH" \; find runtime \( -name "*.h" -o -name "*.hpp" \) -exec rsync -R '{}' "$EXECUTORCH_INCLUDE_DIR_PATH" \; find util \( -name "*.h" -o -name "*.hpp" \) -exec rsync -R '{}' "$EXECUTORCH_INCLUDE_DIR_PATH" \; -find sdk \( -name "*.h" -o -name "*.hpp" \) -exec rsync -R '{}' "$EXECUTORCH_INCLUDE_DIR_PATH" \; +find devtools \( -name "*.h" -o -name "*.hpp" \) -exec rsync -R '{}' "$EXECUTORCH_INCLUDE_DIR_PATH" \; cp -rf "$COREML_DIR_PATH/runtime/include/" "$INCLUDE_DIR_PATH" # Copy required libraries diff --git a/examples/apple/coreml/scripts/debugger_cli.py 
b/examples/apple/coreml/scripts/debugger_cli.py index cb978de0746..88390f8d8cb 100644 --- a/examples/apple/coreml/scripts/debugger_cli.py +++ b/examples/apple/coreml/scripts/debugger_cli.py @@ -24,7 +24,7 @@ def get_root_dir_path() -> Path: sys.path.append(str((get_root_dir_path() / "examples").resolve())) from inspector_utils import ( - build_sdk_runner_including_coreml, + build_devtools_runner_including_coreml, ComparisonResult, create_inspector_coreml, create_inspector_reference, @@ -145,7 +145,7 @@ def main() -> None: f"Valid compute units are {valid_compute_units}." ) - build_sdk_runner_including_coreml( + build_devtools_runner_including_coreml( root_dir_path=get_root_dir_path(), conda_env_name=args.conda_environment_name ) diff --git a/examples/apple/coreml/scripts/export.py b/examples/apple/coreml/scripts/export.py index 4bf26a7f3ea..e906c0704cb 100644 --- a/examples/apple/coreml/scripts/export.py +++ b/examples/apple/coreml/scripts/export.py @@ -17,10 +17,10 @@ from executorch.backends.apple.coreml.compiler import CoreMLBackend from executorch.backends.apple.coreml.partition import CoreMLPartitioner +from executorch.devtools.etrecord import generate_etrecord from executorch.exir import to_edge from executorch.exir.backend.backend_api import to_backend -from executorch.sdk.etrecord import generate_etrecord from torch.export import export REPO_ROOT = pathlib.Path(__file__).resolve().parent.parent.parent.parent.parent @@ -104,11 +104,7 @@ def export_lowered_module_to_executorch_program(lowered_module, example_inputs): lowered_module(*example_inputs) exec_prog = to_edge( export(lowered_module, example_inputs), compile_config=_EDGE_COMPILE_CONFIG - ).to_executorch( - config=exir.ExecutorchBackendConfig( - extract_constant_segment=False, extract_delegate_segments=True - ) - ) + ).to_executorch(config=exir.ExecutorchBackendConfig(extract_delegate_segments=True)) return exec_prog @@ -178,9 +174,7 @@ def generate_compile_specs_from_args(args): ) delegated_program_manager = edge_program_manager.to_backend(partitioner) exec_program = delegated_program_manager.to_executorch( - config=exir.ExecutorchBackendConfig( - extract_constant_segment=False, extract_delegate_segments=True - ) + config=exir.ExecutorchBackendConfig(extract_delegate_segments=True) ) else: lowered_module, edge_copy = lower_module_to_coreml( diff --git a/examples/apple/coreml/scripts/inspector_cli.py b/examples/apple/coreml/scripts/inspector_cli.py index 768465f770a..c63d4791fcf 100644 --- a/examples/apple/coreml/scripts/inspector_cli.py +++ b/examples/apple/coreml/scripts/inspector_cli.py @@ -8,8 +8,8 @@ from pathlib import Path -from executorch.sdk import Inspector -from executorch.sdk.inspector._inspector_utils import compare_results +from executorch.devtools import Inspector +from executorch.devtools.inspector import compare_results def get_root_dir_path() -> Path: diff --git a/examples/apple/coreml/scripts/inspector_utils.py b/examples/apple/coreml/scripts/inspector_utils.py index 1736c2cefbf..08af6fb3484 100644 --- a/examples/apple/coreml/scripts/inspector_utils.py +++ b/examples/apple/coreml/scripts/inspector_utils.py @@ -20,6 +20,13 @@ from executorch.backends.apple.coreml.compiler import CoreMLBackend from executorch.backends.apple.coreml.partition import CoreMLPartitioner +from executorch.devtools import BundledProgram, generate_etrecord, Inspector +from executorch.devtools.bundled_program.config import MethodTestCase, MethodTestSuite +from executorch.devtools.bundled_program.serialize import ( + 
serialize_from_bundled_program_to_flatbuffer, +) +from executorch.devtools.inspector import Event + from executorch.exir import ( EdgeProgramManager, ExecutorchBackendConfig, @@ -30,14 +37,6 @@ from executorch.exir.backend.compile_spec_schema import CompileSpec from executorch.exir.tracer import Value -from executorch.sdk import BundledProgram, generate_etrecord, Inspector - -from executorch.sdk.bundled_program.config import MethodTestCase, MethodTestSuite -from executorch.sdk.bundled_program.serialize import ( - serialize_from_bundled_program_to_flatbuffer, -) -from executorch.sdk.inspector import Event - from torch.export import export, ExportedProgram COREML_METADATA_KEYS: Final[List[Tuple[str, str]]] = [ @@ -48,26 +47,26 @@ ] -def build_sdk_runner_including_coreml( +def build_devtools_runner_including_coreml( root_dir_path: Path, conda_env_name: str, force: bool = False, ): if not force: - sdk_executable_path = ( - root_dir_path / "cmake-out" / "examples" / "sdk" / "sdk_example_runner" + devtools_executable_path = ( + root_dir_path / "cmake-out" / "examples" / "devtools" / "example_runner" ) - print(sdk_executable_path) - if sdk_executable_path.is_file(): + print(devtools_executable_path) + if devtools_executable_path.is_file(): return cd_root_command: str = f"cd {root_dir_path.resolve()}" conda_activate_env_command: str = f"source conda activate {conda_env_name}" - build_sdk_runner_command: str = ( - "./examples/sdk/build_sdk_example_runner.sh --coreml" + build_devtools_runner_command: str = ( + "./examples/devtools/build_example_runner.sh --coreml" ) build_command: str = ( - f"{cd_root_command} && {conda_activate_env_command} && {build_sdk_runner_command}" + f"{cd_root_command} && {conda_activate_env_command} && {build_devtools_runner_command}" ) subprocess.run( f'bash -c "{build_command}"', shell=True, check=True @@ -80,7 +79,6 @@ def build_sdk_runner_including_coreml( ) _EDGE_BACKEND_CONFIG = exir.ExecutorchBackendConfig( - extract_constant_segment=False, extract_delegate_segments=True, ) @@ -175,22 +173,24 @@ def generate_etdump_with_intermediate_values( debug_buffer_path: Path, debug_buffer_size: int, ): - sdk_executable_path = ( - root_dir_path / "cmake-out" / "examples" / "sdk" / "sdk_example_runner" + devtools_executable_path = ( + root_dir_path / "cmake-out" / "examples" / "devtools" / "example_runner" ) - if not sdk_executable_path.is_file(): + if not devtools_executable_path.is_file(): raise FileNotFoundError( - errno.ENOENT, os.strerror(errno.ENOENT), str(sdk_executable_path.resolve()) + errno.ENOENT, + os.strerror(errno.ENOENT), + str(devtools_executable_path.resolve()), ) - sdk_runner_command: str = f""" - {sdk_executable_path.resolve()} -dump_intermediate_outputs\ + devtools_runner_command: str = f""" + {devtools_executable_path.resolve()} -dump_intermediate_outputs\ -bundled_program_path {bundled_program_path.resolve()}\ -etdump_path {et_dump_path.resolve()}\ -debug_output_path {debug_buffer_path.resolve()}\ -debug_buffer_size {debug_buffer_size}""" subprocess.run( - f'bash -c "{sdk_runner_command}"', shell=True, check=True + f'bash -c "{devtools_runner_command}"', shell=True, check=True ).check_returncode() diff --git a/examples/apple/mps/CMakeLists.txt b/examples/apple/mps/CMakeLists.txt index d1dd8e93d7e..319d8159ced 100644 --- a/examples/apple/mps/CMakeLists.txt +++ b/examples/apple/mps/CMakeLists.txt @@ -92,8 +92,8 @@ if(NOT CMAKE_TOOLCHAIN_FILE MATCHES ".*(iOS|ios\.toolchain)\.cmake$") include(${EXECUTORCH_SRCS_FILE}) target_include_directories( bundled_program - 
INTERFACE ${CMAKE_CURRENT_BINARY_DIR}/../../../sdk/include - ${CMAKE_CURRENT_BINARY_DIR}/../../../sdk/bundled_program + INTERFACE ${CMAKE_CURRENT_BINARY_DIR}/../../../devtools/include + ${CMAKE_CURRENT_BINARY_DIR}/../../../devtools/bundled_program ${EXECUTORCH_ROOT}/third-party/flatbuffers/include ${EXECUTORCH_ROOT}/third-party/flatcc/include ${_mps_schema_headers} diff --git a/examples/apple/mps/README.md b/examples/apple/mps/README.md index bebd1329be4..dc01d585f84 100644 --- a/examples/apple/mps/README.md +++ b/examples/apple/mps/README.md @@ -30,7 +30,7 @@ Once we have the model binary file, then let's run it with the ExecuTorch runtim # Build and install executorch cmake -DCMAKE_INSTALL_PREFIX=cmake-out \ -DCMAKE_BUILD_TYPE=Release \ - -DEXECUTORCH_BUILD_SDK=ON \ + -DEXECUTORCH_BUILD_DEVTOOLS=ON \ -DEXECUTORCH_ENABLE_EVENT_TRACER=ON \ -DEXECUTORCH_BUILD_MPS=ON \ -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" \ diff --git a/examples/apple/mps/executor_runner/mps_executor_runner.mm b/examples/apple/mps/executor_runner/mps_executor_runner.mm index 604419a620e..e3d0e2978b6 100644 --- a/examples/apple/mps/executor_runner/mps_executor_runner.mm +++ b/examples/apple/mps/executor_runner/mps_executor_runner.mm @@ -30,8 +30,8 @@ #include #include #include -#include -#include +#include +#include #include using namespace std::chrono; @@ -97,8 +97,26 @@ 262144, // 256 KB "Size of the debug buffer in bytes to allocate for intermediate outputs and program outputs logging."); -using namespace torch::executor; -using torch::executor::util::FileDataLoader; +using executorch::etdump::ETDumpGen; +using executorch::etdump::ETDumpResult; +using executorch::extension::BufferCleanup; +using executorch::extension::BufferDataLoader; +using executorch::extension::FileDataLoader; +using executorch::runtime::DataLoader; +using executorch::runtime::EValue; +using executorch::runtime::Error; +using executorch::runtime::EventTracerDebugLogLevel; +using executorch::runtime::FreeableBuffer; +using executorch::runtime::HierarchicalAllocator; +using executorch::runtime::MemoryAllocator; +using executorch::runtime::MemoryManager; +using executorch::runtime::Method; +using executorch::runtime::MethodMeta; +using executorch::runtime::Program; +using executorch::runtime::Result; +using executorch::runtime::Span; + +namespace bundled_program = executorch::bundled_program; int main(int argc, char** argv) { { @@ -113,7 +131,7 @@ int main(int argc, char** argv) { return 1; } - runtime_init(); + executorch::runtime::runtime_init(); gflags::ParseCommandLineFlags(&argc, &argv, true); if (argc != 1) { @@ -144,20 +162,20 @@ int main(int argc, char** argv) { // Find the offset to the embedded Program. const void* program_data; size_t program_data_len; - Error status = torch::executor::bundled_program::GetProgramData( + Error status = bundled_program::get_program_data( const_cast(file_data->data()), file_data->size(), &program_data, &program_data_len); ET_CHECK_MSG( status == Error::Ok, - "GetProgramData() failed on file '%s': 0x%x", + "get_program_data() failed on file '%s': 0x%x", model_path, (unsigned int)status); // Wrap the buffer in a DataLoader. auto buffer_data_loader = - util::BufferDataLoader(program_data, program_data_len); + BufferDataLoader(program_data, program_data_len); // Parse the program file. This is immutable, and can also be reused between // multiple execution invocations across multiple threads. @@ -239,7 +257,7 @@ HierarchicalAllocator planned_memory( // be used by a single thread at at time, but it can be reused. 
// - torch::executor::ETDumpGen etdump_gen = torch::executor::ETDumpGen(); + ETDumpGen etdump_gen; Result method = program->load_method(method_name, &memory_manager, &etdump_gen); ET_CHECK_MSG( @@ -263,11 +281,11 @@ HierarchicalAllocator planned_memory( } // Prepare the inputs. - std::unique_ptr inputs; + std::unique_ptr inputs; if (FLAGS_bundled_program) { ET_LOG(Info, "Loading bundled program..."); // Use the inputs embedded in the bundled program. - status = torch::executor::bundled_program::LoadBundledInput( + status = bundled_program::load_bundled_input( *method, file_data->data(), FLAGS_testset_idx); @@ -278,11 +296,11 @@ HierarchicalAllocator planned_memory( } else { ET_LOG(Info, "Loading non-bundled program...\n"); // Use ones-initialized inputs. - auto inputs_result = torch::executor::util::prepare_input_tensors(*method); + auto inputs_result = executorch::extension::prepare_input_tensors(*method); if (inputs_result.ok()) { // Will free the inputs when destroyed. inputs = - std::make_unique(std::move(inputs_result.get())); + std::make_unique(std::move(inputs_result.get())); } } ET_LOG(Info, "Inputs prepared."); @@ -322,14 +340,14 @@ HierarchicalAllocator planned_memory( status = method->get_outputs(outputs.data(), outputs.size()); ET_CHECK(status == Error::Ok); // Print the first and last 100 elements of long lists of scalars. - std::cout << torch::executor::util::evalue_edge_items(100); + std::cout << executorch::extension::evalue_edge_items(100); for (int i = 0; i < outputs.size(); ++i) { std::cout << "Output " << i << ": " << outputs[i] << std::endl; } // Dump the etdump data containing profiling/debugging data to the specified // file. - etdump_result result = etdump_gen.get_etdump_data(); + ETDumpResult result = etdump_gen.get_etdump_data(); if (result.buf != nullptr && result.size > 0) { FILE* f = fopen(FLAGS_etdump_path.c_str(), "w+"); fwrite((uint8_t*)result.buf, 1, result.size, f); @@ -362,7 +380,7 @@ HierarchicalAllocator planned_memory( atol = 1e-01; rtol = 1e-01; } - status = torch::executor::bundled_program::VerifyResultWithBundledExpectedOutput( + status = bundled_program::verify_method_outputs( *method, file_data->data(), FLAGS_testset_idx, diff --git a/examples/apple/mps/executor_runner/targets.bzl b/examples/apple/mps/executor_runner/targets.bzl index fd0a7a50468..14399411ae3 100644 --- a/examples/apple/mps/executor_runner/targets.bzl +++ b/examples/apple/mps/executor_runner/targets.bzl @@ -28,9 +28,9 @@ def define_common_targets(): "//executorch/extension/data_loader:file_data_loader", "//executorch/kernels/portable:generated_lib", "//executorch/extension/data_loader:file_data_loader", - "//executorch/sdk/etdump:etdump_flatcc", + "//executorch/devtools/etdump:etdump_flatcc", "//executorch/extension/data_loader:buffer_data_loader", - "//executorch/sdk/bundled_program:runtime", + "//executorch/devtools/bundled_program:runtime", ], external_deps = [ "gflags", diff --git a/examples/apple/mps/scripts/build_mps_executor_runner.sh b/examples/apple/mps/scripts/build_mps_executor_runner.sh index 16754588b67..31ab54fd4d3 100755 --- a/examples/apple/mps/scripts/build_mps_executor_runner.sh +++ b/examples/apple/mps/scripts/build_mps_executor_runner.sh @@ -41,7 +41,7 @@ rm -rf "$OUTPUT" cmake -DBUCK2="$BUCK" \ -DCMAKE_INSTALL_PREFIX=cmake-out \ -DCMAKE_BUILD_TYPE="$MODE" \ - -DEXECUTORCH_BUILD_SDK=ON \ + -DEXECUTORCH_BUILD_DEVTOOLS=ON \ -DEXECUTORCH_ENABLE_EVENT_TRACER=ON \ -DEXECUTORCH_BUILD_MPS=ON \ -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" \ diff --git 
a/examples/apple/mps/scripts/mps_example.py b/examples/apple/mps/scripts/mps_example.py index e561afb1858..d6416e0ffc8 100644 --- a/examples/apple/mps/scripts/mps_example.py +++ b/examples/apple/mps/scripts/mps_example.py @@ -14,6 +14,11 @@ from executorch import exir from executorch.backends.apple.mps import MPSBackend from executorch.backends.apple.mps.partition import MPSPartitioner +from executorch.devtools import BundledProgram, generate_etrecord +from executorch.devtools.bundled_program.config import MethodTestCase, MethodTestSuite +from executorch.devtools.bundled_program.serialize import ( + serialize_from_bundled_program_to_flatbuffer, +) from executorch.exir import ( EdgeCompileConfig, @@ -24,11 +29,6 @@ from executorch.exir.backend.backend_details import CompileSpec from executorch.exir.capture._config import ExecutorchBackendConfig from executorch.extension.export_util.utils import export_to_edge, save_pte_program -from executorch.sdk import BundledProgram, generate_etrecord -from executorch.sdk.bundled_program.config import MethodTestCase, MethodTestSuite -from executorch.sdk.bundled_program.serialize import ( - serialize_from_bundled_program_to_flatbuffer, -) from ....models import MODEL_NAME_TO_MODEL from ....models.model_factory import EagerModelFactory @@ -183,9 +183,7 @@ def get_model_config(args): logging.info(f"Lowered graph:\n{edge.exported_program().graph}") executorch_program = edge.to_executorch( - config=ExecutorchBackendConfig( - extract_delegate_segments=False, extract_constant_segment=False - ) + config=ExecutorchBackendConfig(extract_delegate_segments=False) ) else: lowered_module = to_backend( @@ -195,11 +193,7 @@ def get_model_config(args): lowered_module, example_inputs, edge_compile_config=exir.EdgeCompileConfig(_check_ir_validity=False), - ).to_executorch( - config=ExecutorchBackendConfig( - extract_delegate_segments=False, extract_constant_segment=False - ) - ) + ).to_executorch(config=ExecutorchBackendConfig(extract_delegate_segments=False)) model_name = f"{args.model_name}_mps" diff --git a/examples/apple/mps/test_mps.sh b/examples/apple/mps/test_mps.sh index 55712089e07..555161dd3f7 100755 --- a/examples/apple/mps/test_mps.sh +++ b/examples/apple/mps/test_mps.sh @@ -11,14 +11,14 @@ set -e # shellcheck source=/dev/null source "$(dirname "${BASH_SOURCE[0]}")/../../../.ci/scripts/utils.sh" -cmake_install_executorch_sdk_lib() { +cmake_install_executorch_devtools_lib() { echo "Installing libexecutorch.a, libportable_kernels.a, libetdump.a, libbundled_program.a" rm -rf cmake-out retry cmake -DBUCK2="$BUCK" \ -DCMAKE_INSTALL_PREFIX=cmake-out \ -DCMAKE_BUILD_TYPE=Release \ - -DEXECUTORCH_BUILD_SDK=ON \ + -DEXECUTORCH_BUILD_DEVTOOLS=ON \ -DEXECUTORCH_BUILD_MPS=ON \ -DEXECUTORCH_ENABLE_EVENT_TRACER=ON \ -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" \ @@ -60,5 +60,5 @@ then fi -cmake_install_executorch_sdk_lib +cmake_install_executorch_devtools_lib test_cmake_mps diff --git a/examples/arm/aot_arm_compiler.py b/examples/arm/aot_arm_compiler.py index f854a081fa6..9a45195e58f 100644 --- a/examples/arm/aot_arm_compiler.py +++ b/examples/arm/aot_arm_compiler.py @@ -214,7 +214,11 @@ def forward(self, x): edge = edge.to_backend( ArmPartitioner( ArmCompileSpecBuilder() - .ethosu_compile_spec("ethos-u55-128") + .ethosu_compile_spec( + "ethos-u55-128", + system_config="Ethos_U55_High_End_Embedded", + memory_mode="Shared_Sram", + ) .set_permute_memory_format( args.model_name in MODEL_NAME_TO_MODEL.keys() ) @@ -226,9 +230,7 @@ def forward(self, x): try: exec_prog = 
edge.to_executorch( - config=ExecutorchBackendConfig( - extract_delegate_segments=False, extract_constant_segment=False - ) + config=ExecutorchBackendConfig(extract_delegate_segments=False) ) except RuntimeError as e: if "Missing out variants" in str(e.args[0]): diff --git a/examples/arm/executor_runner/CMakeLists.txt b/examples/arm/executor_runner/CMakeLists.txt index 1f42eda9fbc..68c5435dffe 100644 --- a/examples/arm/executor_runner/CMakeLists.txt +++ b/examples/arm/executor_runner/CMakeLists.txt @@ -9,25 +9,38 @@ project(arm_executor_runner) option(SEMIHOSTING "Enable semihosting" OFF) if(NOT DEFINED ET_PTE_FILE_PATH AND NOT ${SEMIHOSTING}) - message(FATAL_ERROR - "ET_PTE_FILE_PATH must specify a model .pte, for bare metal systems the " - "model is built into the binary.") + message( + FATAL_ERROR + "ET_PTE_FILE_PATH must specify a model .pte, for bare metal systems the " + "model is built into the binary." + ) endif() # Example ExecuTorch demo for bare metal Cortex-M based systems -set(ET_DIR_PATH "../../.." CACHE PATH - "Path to ExecuTorch dir") -set(ET_BUILD_DIR_PATH "${ET_DIR_PATH}/cmake-out" CACHE PATH - "Path to ExecuTorch build dir") -set(ET_INCLUDE_PATH "${ET_DIR_PATH}/.." CACHE PATH - "Path to ExecuTorch headers") -set(ET_PTE_FILE_PATH "" CACHE PATH - "Path to ExecuTorch model pte") -set(ETHOS_SDK_PATH "${ET_DIR_PATH}/examples/arm/ethos-u-scratch/ethos-u" CACHE PATH - "Path to Ethos-U bare metal driver/env") -set(PYTHON_EXECUTABLE "python" CACHE PATH - "Define to override python executable used") - +set(ET_DIR_PATH + "../../.." + CACHE PATH "Path to ExecuTorch dir" +) +set(ET_BUILD_DIR_PATH + "${ET_DIR_PATH}/cmake-out" + CACHE PATH "Path to ExecuTorch build dir" +) +set(ET_INCLUDE_PATH + "${ET_DIR_PATH}/.." + CACHE PATH "Path to ExecuTorch headers" +) +set(ET_PTE_FILE_PATH + "" + CACHE PATH "Path to ExecuTorch model pte" +) +set(ETHOS_SDK_PATH + "${ET_DIR_PATH}/examples/arm/ethos-u-scratch/ethos-u" + CACHE PATH "Path to Ethos-U bare metal driver/env" +) +set(PYTHON_EXECUTABLE + "python" + CACHE PATH "Define to override python executable used" +) get_filename_component(ET_BUILD_DIR_PATH ${ET_BUILD_DIR_PATH} REALPATH) get_filename_component(ET_DIR_PATH ${ET_DIR_PATH} REALPATH) @@ -104,23 +117,25 @@ set_property( # Convert pte to header if(NOT ${SEMIHOSTING}) - add_custom_target(gen_model_header - DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/model_pte.h) + add_custom_target( + gen_model_header DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/model_pte.h + ) add_custom_command( - OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/model_pte.h - COMMAND ${PYTHON_EXECUTABLE} ${CMAKE_SOURCE_DIR}/pte_to_header.py - --pte ${ET_PTE_FILE_PATH} - --outdir ${CMAKE_CURRENT_BINARY_DIR} - DEPENDS ${ET_PTE_FILE_PATH} - WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} + OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/model_pte.h + COMMAND ${PYTHON_EXECUTABLE} ${CMAKE_SOURCE_DIR}/pte_to_header.py --pte + ${ET_PTE_FILE_PATH} --outdir ${CMAKE_CURRENT_BINARY_DIR} + DEPENDS ${ET_PTE_FILE_PATH} + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} ) endif() # The arm_executor_runner executable add_executable(arm_executor_runner) -target_sources(arm_executor_runner PRIVATE arm_executor_runner.cpp) +target_sources( + arm_executor_runner PRIVATE arm_executor_runner.cpp arm_perf_monitor.cpp +) # Include the target's bare-metal linker script ethosu_eval_link_options(arm_executor_runner) @@ -146,19 +161,17 @@ target_include_directories( arm_executor_runner PRIVATE ${ET_INCLUDE_PATH} ${CMAKE_CURRENT_BINARY_DIR} ) - - if(SEMIHOSTING) 
-target_compile_definitions(arm_executor_runner PUBLIC SEMIHOSTING) + target_compile_definitions(arm_executor_runner PUBLIC SEMIHOSTING) else() -add_dependencies(arm_executor_runner gen_model_header) + add_dependencies(arm_executor_runner gen_model_header) endif() # Fixup compilation of retarget.c if(SEMIHOSTING) -# Remove this when MLBEDSW-8910 is closed. -set_source_files_properties( - ${ETHOS_SDK_PATH}/core_platform/targets/corstone-300/retarget.c - PROPERTIES HEADER_FILE_ONLY TRUE -) + # Remove this when MLBEDSW-8910 is closed. + set_source_files_properties( + ${ETHOS_SDK_PATH}/core_platform/targets/corstone-300/retarget.c + PROPERTIES HEADER_FILE_ONLY TRUE + ) endif() diff --git a/examples/arm/executor_runner/arm_executor_runner.cpp b/examples/arm/executor_runner/arm_executor_runner.cpp index 6256ff47cf6..9ca3ebcdc7c 100644 --- a/examples/arm/executor_runner/arm_executor_runner.cpp +++ b/examples/arm/executor_runner/arm_executor_runner.cpp @@ -20,36 +20,59 @@ #include #include -/** - * This header file is generated by the build process based on the .pte file - * specified in the ET_PTE_FILE_PATH variable to the cmake build. - * Control of the action of the .pte, it's use of operators and delegates, and - * which are included in the bare metal build are also orchestrated by the - * CMakeLists file. For example use see examples/arm/run.sh - */ +#include "arm_perf_monitor.h" + #ifdef SEMIHOSTING -// TODO: Verify the section attribute to match the linker script -// pending MLETORCH-39 -const size_t input_allocation_pool_size = 1 * 1024 * 1024; +// In our unit test flow, we have the capability to provide an enitre model to +// the Corstone-3xx FVP using semi hosting. Hence, the input allocation pool +// needs to be large enough to take an entire model. On the FVP, +// network_model_sec is linked to the DDR, which is large (256MB on +// Corstone-300). +const size_t input_allocation_pool_size = 100 * 1024 * 1024; unsigned char __attribute__(( section("network_model_sec"), aligned(16))) input_allocation_pool[input_allocation_pool_size]; // memory for the model will be allocated from the input_allocation_pool char* model_pte = nullptr; #else +/** + * This header file is generated by the build process based on the .pte file + * specified in the ET_PTE_FILE_PATH variable to the cmake build. + * Control of the action of the .pte, it's use of operators and delegates, and + * which are included in the bare metal build are also orchestrated by the + * CMakeLists file. 
For example use see examples/arm/run.sh + */ #include "model_pte.h" #endif -using namespace exec_aten; -using namespace std; -using torch::executor::Error; -using torch::executor::Result; +using executorch::aten::ScalarType; +using executorch::aten::Tensor; +using executorch::aten::TensorImpl; +using executorch::extension::BufferCleanup; +using executorch::extension::BufferDataLoader; +using executorch::runtime::Error; +using executorch::runtime::EValue; +using executorch::runtime::HierarchicalAllocator; +using executorch::runtime::MemoryAllocator; +using executorch::runtime::MemoryManager; +using executorch::runtime::Method; +using executorch::runtime::MethodMeta; +using executorch::runtime::Program; +using executorch::runtime::Result; +using executorch::runtime::Span; +using executorch::runtime::Tag; +using executorch::runtime::TensorInfo; #define METHOD_ALLOCATOR_POOL_SIZE (70 * 1024 * 1024) unsigned char __attribute__(( section("network_model_sec"), aligned(16))) method_allocation_pool[METHOD_ALLOCATOR_POOL_SIZE]; +const size_t temp_allocation_pool_size = 1 * 1024 * 1024; +unsigned char __attribute__(( + section("network_model_sec"), + aligned(16))) temp_allocation_pool[temp_allocation_pool_size]; + void et_pal_init(void) {} ET_NORETURN void et_pal_abort(void) { @@ -71,24 +94,26 @@ void et_pal_emit_log_message( size_t line, const char* message, ET_UNUSED size_t length) { - fprintf(stderr, "%c executorch:%s:%zu] %s\n", level, filename, line, message); + fprintf( + stderr, "%c [executorch:%s:%zu] %s\n", level, filename, line, message); } namespace { -using namespace torch::executor; -Result prepare_input_tensors( +Result prepare_input_tensors( Method& method, - torch::executor::MemoryAllocator& allocator, + MemoryAllocator& allocator, std::vector>& input_buffers) { MethodMeta method_meta = method.method_meta(); size_t num_inputs = method_meta.num_inputs(); size_t num_allocated = 0; +#ifdef SEMIHOSTING ET_CHECK_OR_RETURN_ERROR( input_buffers.size() > 0 && num_inputs == input_buffers.size(), InvalidArgument, "Wrong number of inputs allocated compared to method"); +#endif void** inputs = static_cast(allocator.allocate(num_inputs * sizeof(void*))); @@ -162,18 +187,18 @@ Result prepare_input_tensors( ET_LOG( Error, "Failed to prepare input %zu: 0x%" PRIx32, i, (uint32_t)err); // The BufferCleanup will free the inputs when it goes out of scope. 
- util::BufferCleanup cleanup({inputs, num_allocated}); + BufferCleanup cleanup({inputs, num_allocated}); return err; } } - return util::BufferCleanup({inputs, num_allocated}); + return BufferCleanup({inputs, num_allocated}); } #ifdef SEMIHOSTING std::pair read_binary_file( const char* filename, - torch::executor::MemoryAllocator& allocator) { + MemoryAllocator& allocator) { FILE* fp = fopen(filename, "rb"); if (!fp) { ET_LOG( @@ -225,13 +250,13 @@ int main(int argc, const char* argv[]) { (void)argv; #endif - torch::executor::runtime_init(); + executorch::runtime::runtime_init(); std::vector> input_buffers; size_t pte_size = sizeof(model_pte); #ifdef SEMIHOSTING const char* output_basename = nullptr; - torch::executor::MemoryAllocator input_allocator( + MemoryAllocator input_allocator( input_allocation_pool_size, input_allocation_pool); /* parse input parameters */ @@ -264,10 +289,9 @@ int main(int argc, const char* argv[]) { } #endif ET_LOG(Info, "Model in %p %c", model_pte, model_pte[0]); - auto loader = torch::executor::util::BufferDataLoader(model_pte, pte_size); + auto loader = BufferDataLoader(model_pte, pte_size); ET_LOG(Info, "Model PTE file loaded. Size: %lu bytes.", pte_size); - Result program = - torch::executor::Program::load(&loader); + Result program = Program::load(&loader); if (!program.ok()) { ET_LOG( Info, @@ -286,8 +310,7 @@ int main(int argc, const char* argv[]) { } ET_LOG(Info, "Running method %s", method_name); - Result method_meta = - program->method_meta(method_name); + Result method_meta = program->method_meta(method_name); if (!method_meta.ok()) { ET_LOG( Info, @@ -296,13 +319,11 @@ int main(int argc, const char* argv[]) { (unsigned int)method_meta.error()); } - torch::executor::MemoryAllocator method_allocator{ - torch::executor::MemoryAllocator( - METHOD_ALLOCATOR_POOL_SIZE, method_allocation_pool)}; + MemoryAllocator method_allocator( + METHOD_ALLOCATOR_POOL_SIZE, method_allocation_pool); std::vector planned_buffers; // Owns the memory - std::vector> - planned_spans; // Passed to the allocator + std::vector> planned_spans; // Passed to the allocator size_t num_memory_planned_buffers = method_meta->num_memory_planned_buffers(); for (size_t id = 0; id < num_memory_planned_buffers; ++id) { @@ -317,14 +338,16 @@ int main(int argc, const char* argv[]) { planned_spans.push_back({planned_buffers.back(), buffer_size}); } - torch::executor::HierarchicalAllocator planned_memory( + HierarchicalAllocator planned_memory( {planned_spans.data(), planned_spans.size()}); - torch::executor::MemoryManager memory_manager( - &method_allocator, &planned_memory); + MemoryAllocator temp_allocator( + temp_allocation_pool_size, temp_allocation_pool); - Result method = - program->load_method(method_name, &memory_manager); + MemoryManager memory_manager( + &method_allocator, &planned_memory, &temp_allocator); + + Result method = program->load_method(method_name, &memory_manager); if (!method.ok()) { ET_LOG( Info, @@ -349,7 +372,10 @@ int main(int argc, const char* argv[]) { ET_LOG(Info, "Input prepared."); ET_LOG(Info, "Starting the model execution..."); + StartMeasurements(); Error status = method->execute(); + StopMeasurements(); + if (status != Error::Ok) { ET_LOG( Info, @@ -360,13 +386,15 @@ int main(int argc, const char* argv[]) { ET_LOG(Info, "Model executed successfully."); } - std::vector outputs(method->outputs_size()); + std::vector outputs(method->outputs_size()); ET_LOG(Info, "%zu outputs: ", outputs.size()); status = method->get_outputs(outputs.data(), outputs.size()); 
ET_CHECK(status == Error::Ok); for (int i = 0; i < outputs.size(); ++i) { Tensor t = outputs[i].toTensor(); #ifndef SEMIHOSTING + // The output might be collected and parsed so printf() is used instead + // of ET_LOG() here for (int j = 0; j < outputs[i].toTensor().numel(); ++j) { if (t.scalar_type() == ScalarType::Int) { printf( diff --git a/examples/arm/executor_runner/arm_perf_monitor.cpp b/examples/arm/executor_runner/arm_perf_monitor.cpp new file mode 100644 index 00000000000..c53d28baab4 --- /dev/null +++ b/examples/arm/executor_runner/arm_perf_monitor.cpp @@ -0,0 +1,173 @@ +/* Copyright 2024 Arm Limited and/or its affiliates. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include + +#include "arm_perf_monitor.h" + +#ifdef ETHOSU +#include +#include +#include + +static uint32_t ethosu_inference_count = 0; +static uint64_t ethosu_ArmBackendExecuteCycleCountStart = 0; +static uint64_t ethosu_ArmBackendExecuteCycleCount = 0; +static uint64_t ethosu_ArmWhenNPURunCycleCountStart = 0; +static uint64_t ethosu_ArmWhenNPURunCycleCount = 0; +static uint64_t ethosu_pmuCycleCount = 0; +static std::vector ethosu_pmuEventCounts( + ETHOSU_PMU_Get_NumEventCounters(), + 0); + +static const uint32_t ethosu_pmuCountersUsed = 4; +// ethosu_pmuCountersUsed should match numbers of counters setup in +// ethosu_inference_begin() and not be more then the HW supports +static_assert(ETHOSU_PMU_NCOUNTERS >= ethosu_pmuCountersUsed); + +extern "C" { + +// Callback invoked at start of NPU execution +void ethosu_inference_begin(struct ethosu_driver* drv, void*) { + // Enable PMU + ETHOSU_PMU_Enable(drv); + ETHOSU_PMU_PMCCNTR_CFG_Set_Stop_Event(drv, ETHOSU_PMU_NPU_IDLE); + ETHOSU_PMU_PMCCNTR_CFG_Set_Start_Event(drv, ETHOSU_PMU_NPU_ACTIVE); + + // Setup 4 counters + ETHOSU_PMU_Set_EVTYPER(drv, 0, ETHOSU_PMU_AXI0_RD_DATA_BEAT_RECEIVED); + ETHOSU_PMU_Set_EVTYPER(drv, 1, ETHOSU_PMU_AXI1_RD_DATA_BEAT_RECEIVED); + ETHOSU_PMU_Set_EVTYPER(drv, 2, ETHOSU_PMU_AXI0_WR_DATA_BEAT_WRITTEN); + ETHOSU_PMU_Set_EVTYPER(drv, 3, ETHOSU_PMU_NPU_IDLE); + // Enable 4 counters + ETHOSU_PMU_CNTR_Enable(drv, 0xf); + + ETHOSU_PMU_CNTR_Enable(drv, ETHOSU_PMU_CCNT_Msk); + ETHOSU_PMU_CYCCNT_Reset(drv); + + // Reset all counters + ETHOSU_PMU_EVCNTR_ALL_Reset(drv); + + // Save Cortex-M cycle clock to calculate total CPU cycles used in + // ethosu_inference_end() + ethosu_ArmWhenNPURunCycleCountStart = ARM_PMU_Get_CCNTR(); +} + +// Callback invoked at end of NPU execution +void ethosu_inference_end(struct ethosu_driver* drv, void*) { + ethosu_inference_count++; + ethosu_pmuCycleCount += ETHOSU_PMU_Get_CCNTR(drv); + + for (size_t i = 0; i < ethosu_pmuCountersUsed; i++) { + ethosu_pmuEventCounts[i] += ETHOSU_PMU_Get_EVCNTR(drv, i); + } + ETHOSU_PMU_Disable(drv); + // Add Cortex-M cycle clock used during this NPU execution + ethosu_ArmWhenNPURunCycleCount += + (ARM_PMU_Get_CCNTR() - ethosu_ArmWhenNPURunCycleCountStart); +} + +// Callback invoked at start of ArmBackend::execute() +void ArmBackend_execute_begin() { + // Save Cortex-M cycle clock to calculate total CPU cycles used in + // ArmBackend_execute_end() + ethosu_ArmBackendExecuteCycleCountStart = ARM_PMU_Get_CCNTR(); +} + +// Callback invoked at end of ArmBackend::execute() +void ArmBackend_execute_end() { + // Add Cortex-M cycle clock used during this ArmBackend::execute() + ethosu_ArmBackendExecuteCycleCount += + (ARM_PMU_Get_CCNTR() - ethosu_ArmBackendExecuteCycleCountStart); +} +} + 
+void StartMeasurements() { + ethosu_ArmBackendExecuteCycleCount = 0; + ethosu_ArmWhenNPURunCycleCount = 0; + ethosu_pmuCycleCount = 0; + + for (size_t i = 0; i < ethosu_pmuCountersUsed; i++) { + ethosu_pmuEventCounts[i] = 0; + } + ARM_PMU_Enable(); + DCB->DEMCR |= DCB_DEMCR_TRCENA_Msk; // Trace enable + ARM_PMU_CYCCNT_Reset(); + ARM_PMU_CNTR_Enable(PMU_CNTENSET_CCNTR_ENABLE_Msk); +} + +void StopMeasurements() { + ARM_PMU_CNTR_Disable( + PMU_CNTENCLR_CCNTR_ENABLE_Msk | PMU_CNTENCLR_CNT0_ENABLE_Msk | + PMU_CNTENCLR_CNT1_ENABLE_Msk); + uint32_t cycle_count = ARM_PMU_Get_CCNTR(); + + // Number of comand streams handled by the NPU + ET_LOG(Info, "NPU Inferences : %d", ethosu_inference_count); + ET_LOG(Info, "Profiler report, CPU cycles per operator:"); + // This is number of CPU cycles for the ethos-u operator from start to finish + // in the framework If there is more then one commandstream the time is added + // together + ET_LOG( + Info, + "ethos-u : cycle_cnt : %d cycles", + ethosu_ArmBackendExecuteCycleCount); + // We could print a list of the cycles used by the other delegates here in the + // future but now we only print ethos-u: this means that "Operator(s) total: + // ..." will be the same number as ethos-u : cycle_cnt and not the sum of all + ET_LOG( + Info, + "Operator(s) total: %d CPU cycles", + ethosu_ArmBackendExecuteCycleCount); + // Total CPU cycles used in the executorch method->execute() + // Other delegates and no delegates are counted in this + ET_LOG(Info, "Inference runtime: %d CPU cycles total", cycle_count); + + ET_LOG( + Info, + "NOTE: CPU cycle values and ratio calculations require FPGA and identical CPU/NPU frequency"); + + // Avoid division with zero if ARM_PMU_Get_CCNTR() is not enabled properly. + if (cycle_count == 0) { + ET_LOG(Info, "Inference CPU ratio: ?.?? %%"); + ET_LOG(Info, "Inference NPU ratio: ?.?? %%"); + } else { + ET_LOG( + Info, + "Inference CPU ratio: %.2f %%", + 100.0 * (cycle_count - ethosu_ArmWhenNPURunCycleCount) / cycle_count); + ET_LOG( + Info, + "Inference NPU ratio: %.2f %%", + 100.0 * ethosu_ArmWhenNPURunCycleCount / cycle_count); + } + + // CPU cycles used by NPU, e.g. number of CPU cycles used between + // ethosu_inference_begin() and ethosu_inference_end() + // If there is more then one commandstream the time is added together + ET_LOG( + Info, + "cpu_wait_for_npu_cntr : %" PRIu64 " CPU cycles", + ethosu_ArmWhenNPURunCycleCount); + + ET_LOG(Info, "Ethos-U PMU report:"); + ET_LOG(Info, "ethosu_pmu_cycle_cntr : %" PRIu64, ethosu_pmuCycleCount); + + for (size_t i = 0; i < ethosu_pmuCountersUsed; i++) { + ET_LOG(Info, "ethosu_pmu_cntr%zd : %" PRIu64, i, ethosu_pmuEventCounts[i]); + } + ET_LOG( + Info, + "Ethos-U PMU Events:[ETHOSU_PMU_AXI0_RD_DATA_BEAT_RECEIVED, ETHOSU_PMU_AXI1_RD_DATA_BEAT_RECEIVED, ETHOSU_PMU_AXI0_WR_DATA_BEAT_WRITTEN, ETHOSU_PMU_NPU_IDLE]"); +} + +#else +void StartMeasurements() {} + +void StopMeasurements() {} + +#endif diff --git a/examples/arm/executor_runner/arm_perf_monitor.h b/examples/arm/executor_runner/arm_perf_monitor.h new file mode 100644 index 00000000000..3925a9a5713 --- /dev/null +++ b/examples/arm/executor_runner/arm_perf_monitor.h @@ -0,0 +1,10 @@ +/* Copyright 2024 Arm Limited and/or its affiliates. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#pragma once + +void StartMeasurements(); +void StopMeasurements(); diff --git a/examples/arm/run.sh b/examples/arm/run.sh index f41e0ef50c6..4a3f6dbf672 100755 --- a/examples/arm/run.sh +++ b/examples/arm/run.sh @@ -147,6 +147,10 @@ function build_executorch_runner() { cmake --build cmake-out -- -j"$((n - 5))" arm_executor_runner echo "[${FUNCNAME[0]}] Generated baremetal elf file:" find cmake-out -name "arm_executor_runner" + echo "executable_text: $(find -name arm_executor_runner -exec size {} \; | grep -v filename | awk '{print $1}') bytes" + echo "executable_data: $(find -name arm_executor_runner -exec size {} \; | grep -v filename | awk '{print $2}') bytes" + echo "executable_bss: $(find -name arm_executor_runner -exec size {} \; | grep -v filename | awk '{print $3}') bytes" + echo "pte_data_size: $(stat -c%s ${pte}) bytes" } # Execute the executor_runner on FVP Simulator diff --git a/examples/arm/setup.sh b/examples/arm/setup.sh index 272ddcfc0c5..3d99143d27b 100755 --- a/examples/arm/setup.sh +++ b/examples/arm/setup.sh @@ -91,6 +91,7 @@ fi ### Optional user args ######## root_dir=${1:-"${script_dir}/ethos-u-scratch"} +mkdir -p ${root_dir} root_dir=$(realpath ${root_dir}) ######## @@ -215,7 +216,7 @@ function setup_vela() { if [[ ! -e ethos-u-vela ]]; then git clone https://review.mlplatform.org/ml/ethos-u/ethos-u-vela repo_dir="${root_dir}/ethos-u-vela" - base_rev=7706c1281166e7611f4300ed26338087152a33c9 + base_rev=fe0eaa55c5ed319f78c01978f3b40eb11a9bcb38 patch_repo fi cd "${root_dir}/ethos-u-vela" @@ -246,7 +247,6 @@ fi cd "${script_dir}" # Setup the root dir -mkdir -p "${root_dir}" cd "${root_dir}" echo "[main] Using root dir ${root_dir}" diff --git a/examples/demo-apps/android/ExecuTorchDemo/README.md b/examples/demo-apps/android/ExecuTorchDemo/README.md index 89d8c34ee39..a60307dd90f 100644 --- a/examples/demo-apps/android/ExecuTorchDemo/README.md +++ b/examples/demo-apps/android/ExecuTorchDemo/README.md @@ -53,7 +53,7 @@ For delegating to Qualcomm Hexagon NPU, please follow the tutorial [here](build- After generating the model, copy the model to `assets` directory. ```bash -python -m examples.qualcomm.scripts.deeplab_v3 -b cmake-out-android -m SM8450 -s +python -m examples.qualcomm.scripts.deeplab_v3 -b build-android -m SM8450 -s cp deeplab_v3/dlv3_qnn.pte examples/demo-apps/android/ExecuTorchDemo/app/src/main/assets/ ``` @@ -78,6 +78,8 @@ cmake . -DCMAKE_INSTALL_PREFIX=cmake-android-out \ -DEXECUTORCH_BUILD_XNNPACK=ON \ -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL=ON \ + -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ -Bcmake-android-out cmake --build cmake-android-out -j16 --target install @@ -119,6 +121,8 @@ cmake . 
-DCMAKE_INSTALL_PREFIX=cmake-android-out \ -DQNN_SDK_ROOT="${QNN_SDK_ROOT}" \ -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL=ON \ + -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ -Bcmake-android-out cmake --build cmake-android-out -j16 --target install diff --git a/examples/demo-apps/android/ExecuTorchDemo/app/src/main/BUCK b/examples/demo-apps/android/ExecuTorchDemo/app/src/main/BUCK new file mode 100644 index 00000000000..2b33cef732a --- /dev/null +++ b/examples/demo-apps/android/ExecuTorchDemo/app/src/main/BUCK @@ -0,0 +1,67 @@ +load("@fbsource//tools/build_defs:manifold.bzl", "manifold_get") +load("@fbsource//tools/build_defs/android:fb_android_binary.bzl", "fb_android_binary") +load("@fbsource//tools/build_defs/android:fb_android_library.bzl", "fb_android_library") +load("@fbsource//tools/build_defs/android:fb_android_resource.bzl", "fb_android_resource") + +manifold_get( + name = "dl3_xnnpack_fp32", + out = "dl3_xnnpack_fp32.pte", + api_key = "executorch-key", + artifact_path = "tree/models/benchmarking/executorch/dl3_xnnpack_fp32.pte", + bucket_name = "executorch", + sha1 = "3e7af1d8f5ec4acb6de156d361715e16e5f53783", + timeout_msec = 120000, +) + +fb_android_resource( + name = "app_res", + assets = "assets", + package = "com.example.executorchdemo", + res = "res", +) + +fb_android_resource( + name = "model_res", + assets = {"dl3_xnnpack_fp32.pte": ":dl3_xnnpack_fp32"}, + package = "com.example.executorchdemo", + res = "res", +) + +fb_android_library( + name = "app_lib", + srcs = [ + "java/com/example/executorchdemo/ClassificationActivity.java", + "java/com/example/executorchdemo/ImageNetClasses.java", + "java/com/example/executorchdemo/MainActivity.java", + "java/com/example/executorchdemo/TensorImageUtils.java", + ], + autoglob = False, + language = "JAVA", + deps = [ + ":app_res", + "//xplat/executorch/extension/android:executorch", + ], +) + +fb_android_binary( + name = "ExecuTorchDemo", + keystore = "//fbandroid/keystores:debug", + manifest = "AndroidManifest.xml", + manifest_entries = { + "min_sdk_version": 19, # Android supports 19 for minimum + "target_sdk_version": 34, + "version_code": "1", + "version_name": "1.0", + }, + package_type = "release", + skip_proguard = True, + deps = [ + ":app_lib", + ":app_res", + ":model_res", + "//third-party/java/androidx/appcompat/appcompat:appcompat", + "//third-party/java/androidx/constraintlayout/constraintlayout:constraintlayout", + "//xplat/executorch/extension/android:executorch", + "//xplat/executorch/extension/android/jni:executorch_jni_full", + ], +) diff --git a/examples/demo-apps/android/ExecuTorchDemo/setup.sh b/examples/demo-apps/android/ExecuTorchDemo/setup.sh index 05dc3e4492e..00d9201b092 100644 --- a/examples/demo-apps/android/ExecuTorchDemo/setup.sh +++ b/examples/demo-apps/android/ExecuTorchDemo/setup.sh @@ -15,6 +15,7 @@ cmake . 
-DCMAKE_INSTALL_PREFIX="${CMAKE_OUT}" \ -DEXECUTORCH_BUILD_XNNPACK=ON \ -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_TESNOR=ON \ -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ -DCMAKE_BUILD_TYPE=Release \ -B"${CMAKE_OUT}" diff --git a/examples/demo-apps/android/LlamaDemo/README.md b/examples/demo-apps/android/LlamaDemo/README.md index 7bb36657da3..cfb66538269 100644 --- a/examples/demo-apps/android/LlamaDemo/README.md +++ b/examples/demo-apps/android/LlamaDemo/README.md @@ -1,111 +1,139 @@ -# Building ExecuTorch LLaMA Android Demo App - -This app demonstrates the use of the LLaMA chat app demonstrating local inference use case with ExecuTorch. - -## Prerequisites -* Set up your ExecuTorch repo and environment if you haven’t done so by following the [Setting up ExecuTorch](https://pytorch.org/executorch/stable/getting-started-setup) to set up the repo and dev environment. -* Install [Java 17 JDK](https://www.oracle.com/java/technologies/javase/jdk17-archive-downloads.html). -* Install the [Android SDK API Level 34](https://developer.android.com/about/versions/14/setup-sdk) and - [Android NDK 25.0.8775105](https://developer.android.com/studio/projects/install-ndk). - * If you have Android Studio set up, you can install them with - * Android Studio Settings -> Language & Frameworks -> Android SDK -> SDK Platforms -> Check the row with API Level 34. - * Android Studio Settings -> Language & Frameworks -> Android SDK -> SDK Tools -> Check NDK (Side by side) row. - * Alternatively, you can follow [this guide](https://github.com/pytorch/executorch/blob/856e085b9344c8b0bf220a97976140a5b76356aa/examples/demo-apps/android/LlamaDemo/SDK.md) to set up Java/SDK/NDK with CLI. -* Supported Host OS: CentOS, macOS Sonoma on Apple Silicon. - -Note: This demo app and tutorial has only been validated with arm64-v8a [ABI](https://developer.android.com/ndk/guides/abis), with NDK 25.0.8775105. - -## Getting models -Please refer to the [ExecuTorch Llama2 docs](https://github.com/pytorch/executorch/blob/main/examples/models/llama2/README.md) to export the model. - -After you export the model and generate tokenizer.bin, push them device: -```bash -adb shell mkdir -p /data/local/tmp/llama -adb push llama2.pte /data/local/tmp/llama -adb push tokenizer.bin /data/local/tmp/llama -``` +# ExecuTorch Llama Android Demo App -Note: The demo app searches in `/data/local/tmp/llama` for .pte and .bin files as LLAMA model and tokenizer. +We’re excited to share that the newly revamped Android demo app is live and includes many new updates to provide a more intuitive and smoother user experience with a chat use case! The primary goal of this app is to showcase how easily ExecuTorch can be integrated into an Android demo app and how to exercise the many features ExecuTorch and Llama models have to offer. -## Build library -For the demo app to build, we need to build the ExecuTorch AAR library first. +This app serves as a valuable resource to inspire your creativity and provide foundational code that you can customize and adapt for your particular use case. -The AAR library contains the required Java package and the corresponding JNI -library for using ExecuTorch in your Android app. +Please dive in and start exploring our demo app today! We look forward to any feedback and are excited to see your innovative ideas. -### Alternative 1: Use prebuilt AAR library (recommended) -1. Open a terminal window and navigate to the root directory of the `executorch`. -2. 
Run the following command to download the prebuilt library: -```bash -bash examples/demo-apps/android/LlamaDemo/download_prebuilt_lib.sh -``` +## Key Concepts +From this demo app, you will learn many key concepts such as: +* How to prepare Llama models, build the ExecuTorch library, and model inferencing across delegates +* Expose the ExecuTorch library via JNI layer +* Familiarity with current ExecuTorch app-facing capabilities -The prebuilt AAR library contains the Java library and the JNI binding for -NativePeer.java and ExecuTorch native library, including core ExecuTorch -runtime libraries, XNNPACK backend, Portable kernels, Optimized kernels, -and Quantized kernels. It comes with two ABI variants, arm64-v8a and x86_64. +The goal is for you to see the type of support ExecuTorch provides and feel comfortable with leveraging it for your use cases. -If you want to use the prebuilt library for your own app, please refer to -[Using Android prebuilt libraries (AAR)](./android-prebuilt-library.md) for -tutorial. +## Supporting Models +As a whole, the models that this app supports are (varies by delegate): +* Llama 3.1 8B +* Llama 3 8B +* Llama 2 7B +* LLaVA-1.5 vision model (only XNNPACK) -If you need to use other dependencies (like tokenizer), please refer to -Alternative 2: Build from local machine option. -### Alternative 2: Build from local machine -1. Open a terminal window and navigate to the root directory of the `executorch`. -2. Set the following environment variables: -```bash -export ANDROID_NDK= -export ANDROID_ABI=arm64-v8a -``` -Note: `` is the root for the NDK, which is usually under -`~/Library/Android/sdk/ndk/XX.Y.ZZZZZ` for macOS, and contains NOTICE and README.md. -We use `/build/cmake/android.toolchain.cmake` for CMake to cross-compile. +## Building the APK +First it’s important to note that currently ExecuTorch provides support across 3 delegates. Once you identify the delegate of your choice, select the README link to get a complete end-to-end instructions for environment set-up to exporting the models to build ExecuTorch libraries and apps to run on device: -3. (Optional) If you need to use tiktoken as the tokenizer (for LLaMA3), set -`EXECUTORCH_USE_TIKTOKEN=ON` and later CMake will use it as the tokenizer. -If you need to run other models like LLaMA2, skip this skip. +| Delegate | Resource | +| ------------- | ------------- | +| XNNPACK (CPU-based library) | [link](https://github.com/pytorch/executorch/blob/main/examples/demo-apps/android/LlamaDemo/docs/delegates/xnnpack_README.md) | +| QNN (Qualcomm AI Accelerators) | [link](https://github.com/pytorch/executorch/blob/main/examples/demo-apps/android/LlamaDemo/docs/delegates/qualcomm_README.md) | +| MediaTek (MediaTek AI Accelerators) | [link](https://github.com/pytorch/executorch/blob/main/examples/demo-apps/android/LlamaDemo/docs/delegates/mediatek_README.md) | -```bash -export EXECUTORCH_USE_TIKTOKEN=ON # Only for LLaMA3 -``` +## How to Use the App -4. Build the Android Java extension code: -```bash -pushd extension/android -./gradlew build -popd -``` +This section will provide the main steps to use the app, along with a code snippet of the ExecuTorch API. -5. Run the following command set up the required JNI library: -```bash -pushd examples/demo-apps/android/LlamaDemo -./gradlew :app:setup -popd -``` -This is running the shell script [setup.sh](./setup.sh) which configures the required core ExecuTorch, LLAMA2, and Android libraries, builds them, and copy to jniLibs. 
- -## Build APK -### Alternative 1: Android Studio (Recommended) +For loading the app, development, and running on device we recommend Android Studio: 1. Open Android Studio and select "Open an existing Android Studio project" to open examples/demo-apps/android/LlamaDemo. 2. Run the app (^R). This builds and launches the app on the phone. -### Alternative 2: Command line -Without Android Studio UI, we can run gradle directly to build the app. We need to set up the Android SDK path and invoke gradle. -```bash -export ANDROID_HOME= -pushd examples/demo-apps/android/LlamaDemo -./gradlew :app:installDebug -popd +### Opening the App + +Below are the UI features for the app. + +Select the settings widget to get started with picking a model, its parameters and any prompts. +

+ *[screenshots: app main screen and settings widget]*

+### Select Models and Parameters + +Once you've selected the model, tokenizer, and model type, you are ready to click on "Load Model" to have the app load the model and go back to the main Chat activity.

+ *[screenshots: selecting the model, tokenizer, and model type]*

+ + + +Optional Parameters: +* Temperature: Defaulted to 0, you can adjust the temperature for the model as well. The model will reload upon any adjustments. +* System Prompt: Without any formatting, you can enter in a system prompt. For example, "you are a travel assistant" or "give me a response in a few sentences". +* User Prompt: More for the advanced user, if you would like to manually input a prompt then you can do so by modifying the `{{user prompt}}`. You can also modify the special tokens as well. Once changed then go back to the main Chat activity to send. + +#### ExecuTorch App API + +```java +// Upon returning to the Main Chat Activity +mModule = new LlamaModule( + ModelUtils.getModelCategory(mCurrentSettingsFields.getModelType()), + modelPath, + tokenizerPath, + temperature); +int loadResult = mModule.load(); +``` + +* `modelCategory`: Indicate whether it’s a text-only or vision model +* `modePath`: path to the .pte file +* `tokenizerPath`: path to the tokenizer .bin file +* `temperature`: model parameter to adjust the randomness of the model’s output + + +### User Prompt +Once model is successfully loaded then enter any prompt and click the send (i.e. generate) button to send it to the model. +
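Before sending a prompt, it is worth confirming that the `load()` call above actually succeeded. The guard below is only a sketch (the log tag and control flow are illustrative, not the app's exact code); it relies on the convention shown elsewhere in this change that a non-zero status from `LlamaModule.load()` means the model failed to load.

```java
// Illustrative guard: bail out instead of calling generate() on a module that never loaded.
int loadResult = mModule.load();
if (loadResult != 0) {
  android.util.Log.e("LlamaDemo", "Model load failed with status " + loadResult);
  return;
}
```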

+ *[screenshots: entering and sending a prompt]*

+You can ask it follow-up questions as well.

+ *[screenshots: follow-up prompt and response]*

+ +#### ExecuTorch App API + +```java +mModule.generate(prompt, sequence_length, MainActivity.this); ``` +* `prompt`: User formatted prompt +* `sequence_length`: Number of tokens to generate in response to a prompt +* `MainActivity.this`: Indicates that the callback functions (onResult(), onStats()) are present in this class. -On the phone or emulator, you can try running the model: -Android LLaMA App
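To tie the load and generate snippets together, here is a rough sketch of how a send action can drive `generate()` off the UI thread. The widget names (`sendButton`, `promptEditText`) and the single-thread executor are assumptions for illustration; the only pieces taken from this README are the `generate(prompt, sequence_length, callback)` call and the fact that the activity itself serves as the callback.

```java
sendButton.setOnClickListener(view -> {
  String prompt = promptEditText.getText().toString();
  // Run generation on a background thread so the UI stays responsive; generated tokens
  // stream back through onResult() on the callback passed as the last argument.
  java.util.concurrent.Executors.newSingleThreadExecutor()
      .execute(() -> mModule.generate(prompt, 256, MainActivity.this));
});
```

Here `256` is just an example sequence length; pick whatever response budget fits your use case.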
+[*LLaVA-1.5: Only for XNNPACK delegate*] -## Takeaways -Through this tutorial we've learnt how to build the ExecuTorch LLAMA library, and expose it to JNI layer to build the Android app. +For the LLaVA-1.5 implementation, select the exported LLaVA .pte and tokenizer file in the Settings menu and load the model. After this, you can send an image from your gallery or take a live picture along with a text prompt to the model.
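For the image path, a rough sketch of how a picked image becomes model input, based on the `ETImage` helper added elsewhere in this change (the `selectedImage` variable is hypothetical, and the image-capable `generate()` overload is not reproduced here):

```java
// ETImage resizes the picked image and stores its pixels channel-planar
// (all R values, then all G, then all B); getInts() widens those bytes to the
// int[] layout the runner expects.
int[] imageInts = selectedImage.getInts();
// imageInts, together with the image dimensions and the text prompt, is then
// handed to the module's image-capable generate call.
```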

+ *[screenshots: sending an image with a text prompt (LLaVA-1.5)]*

+ + +### Output Generated +To show completion of the follow-up question, here is the complete detailed response from the model. +

+ *[screenshots: full generated response]*

+ +#### ExecuTorch App API + +Ensure you have the following functions in your callback class that you provided in the `mModule.generate()`. For this example, it is `MainActivity.this`. +```java + @Override + public void onResult(String result) { + //...result contains token from response + //.. onResult will continue to be invoked until response is complete + } + + @Override + public void onStats(float tps) { + //...tps (tokens per second) stats is provided by framework + } + +``` ## Reporting Issues If you encountered any bugs or issues following this tutorial please file a bug/issue here on [Github](https://github.com/pytorch/executorch/issues/new). diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/BUCK b/examples/demo-apps/android/LlamaDemo/app/src/main/BUCK new file mode 100644 index 00000000000..80315c4104b --- /dev/null +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/BUCK @@ -0,0 +1,65 @@ +load("@fbsource//tools/build_defs/android:fb_android_binary.bzl", "fb_android_binary") +load("@fbsource//tools/build_defs/android:fb_android_library.bzl", "fb_android_library") +load("@fbsource//tools/build_defs/android:fb_android_resource.bzl", "fb_android_resource") + +oncall("executorch") + +fb_android_resource( + name = "app_res", + package = "com.example.executorchllamademo", + res = "res", +) + +fb_android_library( + name = "app_lib", + srcs = [ + "java/com/example/executorchllamademo/AppLog.java", + "java/com/example/executorchllamademo/DemoSharedPreferences.java", + "java/com/example/executorchllamademo/ETImage.java", + "java/com/example/executorchllamademo/ETLogging.java", + "java/com/example/executorchllamademo/LlmBenchmarkRunner.java", + "java/com/example/executorchllamademo/LogsActivity.java", + "java/com/example/executorchllamademo/LogsAdapter.java", + "java/com/example/executorchllamademo/MainActivity.java", + "java/com/example/executorchllamademo/Message.java", + "java/com/example/executorchllamademo/MessageAdapter.java", + "java/com/example/executorchllamademo/MessageType.java", + "java/com/example/executorchllamademo/ModelRunner.java", + "java/com/example/executorchllamademo/ModelRunnerCallback.java", + "java/com/example/executorchllamademo/ModelType.java", + "java/com/example/executorchllamademo/ModelUtils.java", + "java/com/example/executorchllamademo/PromptFormat.java", + "java/com/example/executorchllamademo/SettingsActivity.java", + "java/com/example/executorchllamademo/SettingsFields.java", + ], + autoglob = False, + language = "JAVA", + deps = [ + ":app_res", + "//third-party/java/androidx/constraintlayout/constraintlayout:constraintlayout", + "//third-party/java/com/google/code/gson/gson:gson", + "//xplat/executorch/extension/android:executorch_llama", + ], +) + +fb_android_binary( + name = "ExecuTorchLlamaDemo", + keystore = "//fbandroid/keystores:debug", + manifest = "AndroidManifest.xml", + manifest_entries = { + "min_sdk_version": 21, + "target_sdk_version": 34, + "version_code": "1", + "version_name": "1.0", + }, + package_type = "release", + skip_proguard = True, + deps = [ + ":app_lib", + ":app_res", + "//third-party/java/androidx/appcompat/appcompat:appcompat", + "//third-party/java/com/google/code/gson/gson:gson", + "//xplat/executorch/extension/android:executorch_llama", + "//xplat/executorch/extension/android/jni:executorch_llama_jni", + ], +) diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/ETImage.java 
b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/ETImage.java index cf3c3e5f0a5..e68c8472626 100644 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/ETImage.java +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/ETImage.java @@ -46,6 +46,16 @@ public byte[] getBytes() { return bytes; } + public int[] getInts() { + // We need to convert the byte array to an int array because + // the runner expects an int array as input. + int[] intArray = new int[bytes.length]; + for (int i = 0; i < bytes.length; i++) { + intArray[i] = (bytes[i++] & 0xFF); + } + return intArray; + } + private byte[] getBytesFromImageURI(Uri uri) { try { int RESIZED_IMAGE_WIDTH = 336; @@ -72,9 +82,9 @@ private byte[] getBytesFromImageURI(Uri uri) { int blue = Color.blue(color); // Store the RGB values in the byte array - rgbValues[(y * width + x) * 3] = (byte) red; - rgbValues[(y * width + x) * 3 + 1] = (byte) green; - rgbValues[(y * width + x) * 3 + 2] = (byte) blue; + rgbValues[y * width + x] = (byte) red; + rgbValues[(y * width + x) + height * width] = (byte) green; + rgbValues[(y * width + x) + 2 * height * width] = (byte) blue; } } return rgbValues; diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/LlmBenchmarkRunner.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/LlmBenchmarkRunner.java index 33b230b1dff..7236fe317b0 100644 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/LlmBenchmarkRunner.java +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/LlmBenchmarkRunner.java @@ -9,13 +9,22 @@ package com.example.executorchllamademo; import android.app.Activity; +import android.app.ActivityManager; import android.content.Intent; +import android.os.Build; import android.os.Bundle; import android.util.Log; import android.widget.TextView; import androidx.annotation.NonNull; +import com.google.gson.Gson; +import java.io.File; import java.io.FileWriter; import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.regex.Matcher; +import java.util.regex.Pattern; public class LlmBenchmarkRunner extends Activity implements ModelRunnerCallback { ModelRunner mModelRunner; @@ -32,7 +41,12 @@ protected void onCreate(Bundle savedInstanceState) { Intent intent = getIntent(); - String modelPath = intent.getStringExtra("model_path"); + File modelDir = new File(intent.getStringExtra("model_dir")); + File model = + Arrays.stream(modelDir.listFiles()) + .filter(file -> file.getName().endsWith(".pte")) + .findFirst() + .get(); String tokenizerPath = intent.getStringExtra("tokenizer_path"); float temperature = intent.getFloatExtra("temperature", 0.8f); @@ -42,19 +56,21 @@ protected void onCreate(Bundle savedInstanceState) { } mStatsDump = new StatsDump(); - mModelRunner = new ModelRunner(modelPath, tokenizerPath, temperature, this); - mStatsDump.loadStart = System.currentTimeMillis(); + mStatsDump.modelName = model.getName().replace(".pte", ""); + mModelRunner = new ModelRunner(model.getPath(), tokenizerPath, temperature, this); + mStatsDump.loadStart = System.nanoTime(); } @Override public void onModelLoaded(int status) { - mStatsDump.loadEnd = System.currentTimeMillis(); + mStatsDump.loadEnd = System.nanoTime(); + mStatsDump.loadStatus = status; if (status != 0) { 
Log.e("LlmBenchmarkRunner", "Loaded failed: " + status); onGenerationStopped(); return; } - mStatsDump.generateStart = System.currentTimeMillis(); + mStatsDump.generateStart = System.nanoTime(); mModelRunner.generate(mPrompt); } @@ -73,26 +89,122 @@ public void onStats(String stats) { @Override public void onGenerationStopped() { - mStatsDump.generateEnd = System.currentTimeMillis(); + mStatsDump.generateEnd = System.nanoTime(); runOnUiThread( () -> { mTextView.append(mStatsDump.toString()); }); - try (FileWriter writer = new FileWriter(getFilesDir() + "/benchmark_results.txt")) { - writer.write(mStatsDump.toString()); + final BenchmarkMetric.BenchmarkModel benchmarkModel = + BenchmarkMetric.extractBackendAndQuantization(mStatsDump.modelName); + final List results = new ArrayList<>(); + // The list of metrics we have atm includes: + // Load status + results.add(new BenchmarkMetric(benchmarkModel, "load_status", mStatsDump.loadStatus, 0)); + // Model load time + results.add( + new BenchmarkMetric( + benchmarkModel, + "model_load_time(ms)", + (mStatsDump.loadEnd - mStatsDump.loadStart) * 1e-6, + 0.0f)); + // LLM generate time + results.add( + new BenchmarkMetric( + benchmarkModel, + "generate_time(ms)", + (mStatsDump.generateEnd - mStatsDump.generateStart) * 1e-6, + 0.0f)); + // Token per second + results.add( + new BenchmarkMetric(benchmarkModel, "token_per_sec", extractTPS(mStatsDump.tokens), 0.0f)); + + try (FileWriter writer = new FileWriter(getFilesDir() + "/benchmark_results.json")) { + Gson gson = new Gson(); + writer.write(gson.toJson(results)); } catch (IOException e) { e.printStackTrace(); } } + + private double extractTPS(final String tokens) { + final Matcher m = Pattern.compile("\\d+\\.?\\d*").matcher(tokens); + if (m.find()) { + return Double.parseDouble(m.group()); + } else { + return 0.0f; + } + } +} + +class BenchmarkMetric { + public static class BenchmarkModel { + // The model name, i.e. stories110M + String name; + String backend; + String quantization; + + public BenchmarkModel(final String name, final String backend, final String quantization) { + this.name = name; + this.backend = backend; + this.quantization = quantization; + } + } + + BenchmarkModel benchmarkModel; + + // The metric name, i.e. 
TPS + String metric; + + // The actual value and the option target value + double actualValue; + double targetValue; + + public static class DeviceInfo { + // Let's see which information we want to include here + final String device = Build.BRAND; + // The phone model and Android release version + final String arch = Build.MODEL; + final String os = "Android " + Build.VERSION.RELEASE; + final long totalMem = new ActivityManager.MemoryInfo().totalMem; + final long availMem = new ActivityManager.MemoryInfo().availMem; + } + + DeviceInfo deviceInfo = new DeviceInfo(); + + public BenchmarkMetric( + final BenchmarkModel benchmarkModel, + final String metric, + final double actualValue, + final double targetValue) { + this.benchmarkModel = benchmarkModel; + this.metric = metric; + this.actualValue = actualValue; + this.targetValue = targetValue; + } + + // TODO (huydhn): Figure out a way to extract the backend and quantization information from + // the .pte model itself instead of parsing its name + public static BenchmarkMetric.BenchmarkModel extractBackendAndQuantization(final String model) { + final Matcher m = + Pattern.compile("(?\\w+)_(?\\w+)_(?\\w+)").matcher(model); + if (m.matches()) { + return new BenchmarkMetric.BenchmarkModel( + m.group("name"), m.group("backend"), m.group("quantization")); + } else { + return new BenchmarkMetric.BenchmarkModel(model, "", ""); + } + } } class StatsDump { + int loadStatus; long loadStart; long loadEnd; long generateStart; long generateEnd; String tokens; + String modelName; @NonNull @Override diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/LogsActivity.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/LogsActivity.java index 8700528d44a..7777b275e6e 100644 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/LogsActivity.java +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/LogsActivity.java @@ -10,10 +10,12 @@ import android.app.AlertDialog; import android.content.DialogInterface; +import android.os.Build; import android.os.Bundle; import android.widget.ImageButton; import android.widget.ListView; import androidx.appcompat.app.AppCompatActivity; +import androidx.core.content.ContextCompat; import androidx.core.graphics.Insets; import androidx.core.view.ViewCompat; import androidx.core.view.WindowInsetsCompat; @@ -26,6 +28,10 @@ public class LogsActivity extends AppCompatActivity { protected void onCreate(Bundle savedInstanceState) { super.onCreate(savedInstanceState); setContentView(R.layout.activity_logs); + if (Build.VERSION.SDK_INT >= 21) { + getWindow().setStatusBarColor(ContextCompat.getColor(this, R.color.status_bar)); + getWindow().setNavigationBarColor(ContextCompat.getColor(this, R.color.nav_bar)); + } ViewCompat.setOnApplyWindowInsetsListener( requireViewById(R.id.main), (v, insets) -> { diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MainActivity.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MainActivity.java index 70936e17d84..524b4fbc8a8 100644 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MainActivity.java +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MainActivity.java @@ -16,9 +16,11 @@ import android.content.Intent; import android.content.pm.PackageManager; import 
android.net.Uri; +import android.os.Build; import android.os.Bundle; import android.os.Handler; import android.os.Looper; +import android.os.Process; import android.provider.MediaStore; import android.system.ErrnoException; import android.system.Os; @@ -44,6 +46,8 @@ import java.lang.reflect.Type; import java.util.ArrayList; import java.util.List; +import java.util.concurrent.Executor; +import java.util.concurrent.Executors; import org.pytorch.executorch.LlamaCallback; import org.pytorch.executorch.LlamaModule; @@ -70,11 +74,25 @@ public class MainActivity extends AppCompatActivity implements Runnable, LlamaCa private SettingsFields mCurrentSettingsFields; private Handler mMemoryUpdateHandler; private Runnable memoryUpdater; + private int promptID = 0; + private long startPos = 0; + private static final int CONVERSATION_HISTORY_MESSAGE_LOOKBACK = 2; + private Executor executor; @Override public void onResult(String result) { - mResultMessage.appendText(result); - run(); + if (result.equals(PromptFormat.getStopToken(mCurrentSettingsFields.getModelType()))) { + return; + } + if (result.equals("\n\n") || result.equals("\n")) { + if (!mResultMessage.getText().isEmpty()) { + mResultMessage.appendText(result); + run(); + } + } else { + mResultMessage.appendText(result); + run(); + } } @Override @@ -102,7 +120,12 @@ private void setLocalModel(String modelPath, String tokenizerPath, float tempera mMessageAdapter.notifyDataSetChanged(); }); long runStartTime = System.currentTimeMillis(); - mModule = new LlamaModule(modelPath, tokenizerPath, temperature); + mModule = + new LlamaModule( + ModelUtils.getModelCategory(mCurrentSettingsFields.getModelType()), + modelPath, + tokenizerPath, + temperature); int loadResult = mModule.load(); long loadDuration = System.currentTimeMillis() - runStartTime; String modelLoadError = ""; @@ -132,6 +155,12 @@ private void setLocalModel(String modelPath, String tokenizerPath, float tempera + (float) loadDuration / 1000 + " sec." 
+ " You can send text or image for inference"; + + if (mCurrentSettingsFields.getModelType() == ModelType.LLAVA_1_5) { + ETLogging.getInstance().log("Llava start prefill prompt"); + startPos = mModule.prefillPrompt(PromptFormat.getLlavaPresetPrompt(), 0, 1, 0); + ETLogging.getInstance().log("Llava completes prefill prompt"); + } } Message modelLoadedMessage = new Message(modelInfo, false, MessageType.SYSTEM, 0); @@ -180,11 +209,21 @@ private void populateExistingMessages(String existingMsgJSON) { mMessageAdapter.notifyDataSetChanged(); } + private int setPromptID() { + + return mMessageAdapter.getMaxPromptID() + 1; + } + @Override protected void onCreate(Bundle savedInstanceState) { super.onCreate(savedInstanceState); setContentView(R.layout.activity_main); + if (Build.VERSION.SDK_INT >= 21) { + getWindow().setStatusBarColor(ContextCompat.getColor(this, R.color.status_bar)); + getWindow().setNavigationBarColor(ContextCompat.getColor(this, R.color.nav_bar)); + } + try { Os.setenv("ADSP_LIBRARY_PATH", getApplicationInfo().nativeLibraryDir, true); } catch (ErrnoException e) { @@ -201,6 +240,7 @@ protected void onCreate(Bundle savedInstanceState) { String existingMsgJSON = mDemoSharedPreferences.getSavedMessages(); if (!existingMsgJSON.isEmpty()) { populateExistingMessages(existingMsgJSON); + promptID = setPromptID(); } mSettingsButton = requireViewById(R.id.settings); mSettingsButton.setOnClickListener( @@ -217,6 +257,7 @@ protected void onCreate(Bundle savedInstanceState) { setupCameraRoll(); startMemoryUpdate(); setupShowLogsButton(); + executor = Executors.newSingleThreadExecutor(); } @Override @@ -522,6 +563,32 @@ private void showMediaPreview(List uris) { imageViews.get(i).setVisibility(View.VISIBLE); imageViews.get(i).setImageURI(mSelectedImageUri.get(i)); } + + // For LLava, we want to call prefill_image as soon as an image is selected + // Llava only support 1 image for now + if (mCurrentSettingsFields.getModelType() == ModelType.LLAVA_1_5) { + List processedImageList = getProcessedImagesForModel(mSelectedImageUri); + if (!processedImageList.isEmpty()) { + mMessageAdapter.add( + new Message("Llava - Starting image Prefill.", false, MessageType.SYSTEM, 0)); + mMessageAdapter.notifyDataSetChanged(); + Runnable runnable = + () -> { + Process.setThreadPriority(Process.THREAD_PRIORITY_MORE_FAVORABLE); + ETLogging.getInstance().log("Starting runnable prefill image"); + ETImage img = processedImageList.get(0); + ETLogging.getInstance().log("Llava start prefill image"); + startPos = + mModule.prefillImages( + img.getInts(), + img.getWidth(), + img.getHeight(), + ModelUtils.VISION_MODEL_IMAGE_CHANNELS, + startPos); + }; + executor.execute(runnable); + } + } } private void addSelectedImagesToChatThread(List selectedImageUri) { @@ -537,6 +604,48 @@ private void addSelectedImagesToChatThread(List selectedImageUri) { mMessageAdapter.notifyDataSetChanged(); } + private String getConversationHistory() { + String conversationHistory = ""; + + ArrayList conversations = + mMessageAdapter.getRecentSavedTextMessages(CONVERSATION_HISTORY_MESSAGE_LOOKBACK); + if (conversations.isEmpty()) { + return conversationHistory; + } + + int prevPromptID = conversations.get(0).getPromptID(); + String conversationFormat = + PromptFormat.getConversationFormat(mCurrentSettingsFields.getModelType()); + String format = conversationFormat; + for (int i = 0; i < conversations.size(); i++) { + Message conversation = conversations.get(i); + int currentPromptID = conversation.getPromptID(); + if (currentPromptID != 
prevPromptID) { + conversationHistory = conversationHistory + format; + format = conversationFormat; + prevPromptID = currentPromptID; + } + if (conversation.getIsSent()) { + format = format.replace(PromptFormat.USER_PLACEHOLDER, conversation.getText()); + } else { + format = format.replace(PromptFormat.ASSISTANT_PLACEHOLDER, conversation.getText()); + } + } + conversationHistory = conversationHistory + format; + + return conversationHistory; + } + + private String getTotalFormattedPrompt(String conversationHistory, String rawPrompt) { + if (conversationHistory.isEmpty()) { + return mCurrentSettingsFields.getFormattedSystemAndUserPrompt(rawPrompt); + } + + return mCurrentSettingsFields.getFormattedSystemPrompt() + + conversationHistory + + mCurrentSettingsFields.getFormattedUserPrompt(rawPrompt); + } + private void onModelRunStarted() { mSendButton.setClickable(false); mSendButton.setImageResource(R.drawable.baseline_stop_24); @@ -552,44 +661,33 @@ private void onModelRunStopped() { mSendButton.setOnClickListener( view -> { addSelectedImagesToChatThread(mSelectedImageUri); - // TODO: When ET supports multimodal, this is where we will add the images as part of the - // prompt. - List processedImageList = getProcessedImagesForModel(mSelectedImageUri); - processedImageList.forEach( - image -> { - ETLogging.getInstance() - .log( - "Image preprocessed:" - + " uri = " - + image.getUri().getLastPathSegment() - + "," - + " width = " - + image.getWidth() - + "," - + " height = " - + image.getHeight() - + "," - + " bytes size = " - + image.getBytes().length); - }); + String finalPrompt; String rawPrompt = mEditTextMessage.getText().toString(); - String prompt = mCurrentSettingsFields.getFormattedSystemAndUserPrompt(rawPrompt); + if (ModelUtils.getModelCategory(mCurrentSettingsFields.getModelType()) + == ModelUtils.VISION_MODEL) { + finalPrompt = mCurrentSettingsFields.getFormattedSystemAndUserPrompt(rawPrompt); + } else { + finalPrompt = getTotalFormattedPrompt(getConversationHistory(), rawPrompt); + } // We store raw prompt into message adapter, because we don't want to show the extra // tokens from system prompt - mMessageAdapter.add(new Message(rawPrompt, true, MessageType.TEXT, 0)); + mMessageAdapter.add(new Message(rawPrompt, true, MessageType.TEXT, promptID)); mMessageAdapter.notifyDataSetChanged(); mEditTextMessage.setText(""); - mResultMessage = new Message("", false, MessageType.TEXT, 0); + mResultMessage = new Message("", false, MessageType.TEXT, promptID); mMessageAdapter.add(mResultMessage); // Scroll to bottom of the list mMessagesView.smoothScrollToPosition(mMessageAdapter.getCount() - 1); // After images are added to prompt and chat thread, we clear the imageURI list // Note: This has to be done after imageURIs are no longer needed by LlamaModule mSelectedImageUri = null; + promptID++; Runnable runnable = new Runnable() { @Override public void run() { + Process.setThreadPriority(Process.THREAD_PRIORITY_MORE_FAVORABLE); + ETLogging.getInstance().log("starting runnable generate()"); runOnUiThread( new Runnable() { @Override @@ -597,9 +695,34 @@ public void run() { onModelRunStarted(); } }); - ETLogging.getInstance().log("Running inference.. 
prompt=" + prompt); long generateStartTime = System.currentTimeMillis(); - mModule.generate(prompt, MainActivity.this); + if (ModelUtils.getModelCategory(mCurrentSettingsFields.getModelType()) + == ModelUtils.VISION_MODEL) { + mModule.generateFromPos( + finalPrompt, + ModelUtils.VISION_MODEL_SEQ_LEN, + startPos, + MainActivity.this, + false); + } else if (mCurrentSettingsFields.getModelType() == ModelType.LLAMA_GUARD_3) { + String llamaGuardPromptForClassification = + PromptFormat.getFormattedLlamaGuardPrompt(rawPrompt); + ETLogging.getInstance() + .log("Running inference.. prompt=" + llamaGuardPromptForClassification); + mModule.generate( + llamaGuardPromptForClassification, + llamaGuardPromptForClassification.length() + 64, + MainActivity.this, + false); + } else { + ETLogging.getInstance().log("Running inference.. prompt=" + finalPrompt); + mModule.generate( + finalPrompt, + (int) (finalPrompt.length() * 0.75) + 64, + MainActivity.this, + false); + } + long generateDuration = System.currentTimeMillis() - generateStartTime; mResultMessage.setTotalGenerationTime(generateDuration); runOnUiThread( @@ -612,7 +735,7 @@ public void run() { ETLogging.getInstance().log("Inference completed"); } }; - new Thread(runnable).start(); + executor.execute(runnable); }); mMessageAdapter.notifyDataSetChanged(); } diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MessageAdapter.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MessageAdapter.java index d9cbd95a1a7..2538c852e48 100644 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MessageAdapter.java +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MessageAdapter.java @@ -16,6 +16,7 @@ import android.widget.ImageView; import android.widget.TextView; import java.util.ArrayList; +import java.util.Collections; public class MessageAdapter extends ArrayAdapter { @@ -90,4 +91,41 @@ public void clear() { public ArrayList getSavedMessages() { return savedMessages; } + + public ArrayList getRecentSavedTextMessages(int numOfLatestPromptMessages) { + ArrayList recentMessages = new ArrayList(); + int lastIndex = savedMessages.size() - 1; + Message messageToAdd = savedMessages.get(lastIndex); + int oldPromptID = messageToAdd.getPromptID(); + + for (int i = 0; i < savedMessages.size(); i++) { + messageToAdd = savedMessages.get(lastIndex - i); + if (messageToAdd.getMessageType() != MessageType.SYSTEM) { + if (messageToAdd.getPromptID() != oldPromptID) { + numOfLatestPromptMessages--; + oldPromptID = messageToAdd.getPromptID(); + } + if (numOfLatestPromptMessages > 0) { + if (messageToAdd.getMessageType() == MessageType.TEXT) { + recentMessages.add(messageToAdd); + } + } else { + break; + } + } + } + + // To place the order in [input1, output1, input2, output2...] 
+ Collections.reverse(recentMessages); + return recentMessages; + } + + public int getMaxPromptID() { + int maxPromptID = -1; + for (Message msg : savedMessages) { + + maxPromptID = Math.max(msg.getPromptID(), maxPromptID); + } + return maxPromptID; + } } diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/ModelType.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/ModelType.java index 91e84be0590..a241ca3d52d 100644 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/ModelType.java +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/ModelType.java @@ -12,4 +12,5 @@ public enum ModelType { LLAMA_3, LLAMA_3_1, LLAVA_1_5, + LLAMA_GUARD_3, } diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/ModelUtils.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/ModelUtils.java new file mode 100644 index 00000000000..ab1f1bc92fc --- /dev/null +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/ModelUtils.java @@ -0,0 +1,28 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +package com.example.executorchllamademo; + +public class ModelUtils { + static final int TEXT_MODEL = 1; + static final int VISION_MODEL = 2; + static final int VISION_MODEL_IMAGE_CHANNELS = 3; + static final int VISION_MODEL_SEQ_LEN = 768; + static final int TEXT_MODEL_SEQ_LEN = 256; + + public static int getModelCategory(ModelType modelType) { + switch (modelType) { + case LLAVA_1_5: + return VISION_MODEL; + case LLAMA_3: + case LLAMA_3_1: + default: + return TEXT_MODEL; + } + } +} diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/PromptFormat.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/PromptFormat.java index 72990f4ea8b..14cf38e669d 100644 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/PromptFormat.java +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/PromptFormat.java @@ -12,6 +12,8 @@ public class PromptFormat { public static final String SYSTEM_PLACEHOLDER = "{{ system_prompt }}"; public static final String USER_PLACEHOLDER = "{{ user_prompt }}"; + public static final String ASSISTANT_PLACEHOLDER = "{{ assistant_response }}"; + public static final String DEFAULT_SYSTEM_PROMPT = "Answer the questions in a few sentences"; public static String getSystemPromptTemplate(ModelType modelType) { switch (modelType) { @@ -21,6 +23,7 @@ public static String getSystemPromptTemplate(ModelType modelType) { + SYSTEM_PLACEHOLDER + "<|eot_id|>"; case LLAVA_1_5: + return "USER: "; default: return SYSTEM_PLACEHOLDER; } @@ -30,13 +33,85 @@ public static String getUserPromptTemplate(ModelType modelType) { switch (modelType) { case LLAMA_3: case LLAMA_3_1: + case LLAMA_GUARD_3: return "<|start_header_id|>user<|end_header_id|>\n" + USER_PLACEHOLDER - + "<|eot_id|>\n" + + "<|eot_id|>" + "<|start_header_id|>assistant<|end_header_id|>"; + + case LLAVA_1_5: + default: + return USER_PLACEHOLDER; + } + } + + public static String getConversationFormat(ModelType modelType) { + switch 
(modelType) { + case LLAMA_3: + case LLAMA_3_1: + return getUserPromptTemplate(modelType) + "\n" + ASSISTANT_PLACEHOLDER + "<|eot_id|>"; case LLAVA_1_5: + return USER_PLACEHOLDER + " ASSISTANT:"; default: return USER_PLACEHOLDER; } } + + public static String getStopToken(ModelType modelType) { + switch (modelType) { + case LLAMA_3: + case LLAMA_3_1: + case LLAMA_GUARD_3: + return "<|eot_id|>"; + case LLAVA_1_5: + return ""; + default: + return ""; + } + } + + public static String getLlavaPresetPrompt() { + return "A chat between a curious human and an artificial intelligence assistant. The assistant" + + " gives helpful, detailed, and polite answers to the human's questions. USER: "; + } + + public static String getFormattedLlamaGuardPrompt(String userPrompt) { + return getUserPromptTemplate(ModelType.LLAMA_GUARD_3) + .replace( + USER_PLACEHOLDER, getLlamaGuardPresetPrompt().replace(USER_PLACEHOLDER, userPrompt)); + } + + public static String getLlamaGuardPresetPrompt() { + return "Task: Check if there is unsafe content in 'User' messages in conversations" + + " according our safety policy with the below categories.\n" + + "\n" + + "\n" + + "S1: Violent Crimes. \n" + + "S2: Non-Violent Crimes. \n" + + "S3: Sex Crimes. \n" + + "S4: Child Exploitation. \n" + + "S5: Defamation. \n" + + "S6: Specialized Advice. \n" + + "S7: Privacy. \n" + + "S8: Intellectual Property. \n" + + "S9: Indiscriminate Weapons. \n" + + "S10: Hate. \n" + + "S11: Self-Harm. \n" + + "S12: Sexual Content. \n" + + "S13: Elections. \n" + + "S14: Code Interpreter Abuse.\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "User: " + + USER_PLACEHOLDER + + "\n" + + "\n" + + "Provide your safety assessment for ONLY THE LAST User message in the above" + + " conversation:\n" + + " - First line must read 'safe' or 'unsafe'.\n" + + " - If unsafe, a second line must include a comma-separated list of violated" + + " categories."; + } } diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/SettingsActivity.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/SettingsActivity.java index 5f1fc96e1ac..773fef19dd7 100644 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/SettingsActivity.java +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/SettingsActivity.java @@ -10,6 +10,7 @@ import android.app.AlertDialog; import android.content.DialogInterface; +import android.os.Build; import android.os.Bundle; import android.text.Editable; import android.text.TextWatcher; @@ -18,6 +19,7 @@ import android.widget.ImageButton; import android.widget.TextView; import androidx.appcompat.app.AppCompatActivity; +import androidx.core.content.ContextCompat; import androidx.core.graphics.Insets; import androidx.core.view.ViewCompat; import androidx.core.view.WindowInsetsCompat; @@ -43,12 +45,16 @@ public class SettingsActivity extends AppCompatActivity { public SettingsFields mSettingsFields; private DemoSharedPreferences mDemoSharedPreferences; - public static double TEMPERATURE_MIN_VALUE = 0.1; + public static double TEMPERATURE_MIN_VALUE = 0.0; @Override protected void onCreate(Bundle savedInstanceState) { super.onCreate(savedInstanceState); setContentView(R.layout.activity_settings); + if (Build.VERSION.SDK_INT >= 21) { + getWindow().setStatusBarColor(ContextCompat.getColor(this, R.color.status_bar)); + getWindow().setNavigationBarColor(ContextCompat.getColor(this, R.color.nav_bar)); + } 
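// The status and navigation bars above are tinted only on API 21+, where setStatusBarColor/setNavigationBarColor are available; older releases keep the system defaults.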
ViewCompat.setOnApplyWindowInsetsListener( requireViewById(R.id.main), (v, insets) -> { @@ -120,6 +126,7 @@ private void setupLoadModelButton() { public void onClick(DialogInterface dialog, int whichButton) { mSettingsFields.saveLoadModelAction(true); mLoadModelButton.setEnabled(false); + onBackPressed(); } }) .setNegativeButton(android.R.string.no, null) @@ -208,8 +215,7 @@ public void afterTextChanged(Editable s) { new DialogInterface.OnClickListener() { public void onClick(DialogInterface dialog, int whichButton) { // Clear the messageAdapter and sharedPreference - mSystemPromptEditText.setText( - PromptFormat.getSystemPromptTemplate(mModelType)); + mSystemPromptEditText.setText(PromptFormat.DEFAULT_SYSTEM_PROMPT); } }) .setNegativeButton(android.R.string.no, null) diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/SettingsFields.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/SettingsFields.java index 466d3303e28..b71799981b2 100644 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/SettingsFields.java +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/SettingsFields.java @@ -38,12 +38,12 @@ public String getFormattedSystemAndUserPrompt(String prompt) { return getFormattedSystemPrompt() + getFormattedUserPrompt(prompt); } - private String getFormattedSystemPrompt() { + public String getFormattedSystemPrompt() { return PromptFormat.getSystemPromptTemplate(modelType) .replace(PromptFormat.SYSTEM_PLACEHOLDER, systemPrompt); } - private String getFormattedUserPrompt(String prompt) { + public String getFormattedUserPrompt(String prompt) { return userPrompt.replace(PromptFormat.USER_PLACEHOLDER, prompt); } diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/banner_shape.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/banner_shape.xml index 70f251ee649..0868ffffa6f 100644 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/banner_shape.xml +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/banner_shape.xml @@ -1,7 +1,5 @@ - - + \ No newline at end of file diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_add_24.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_add_24.xml index 9f83b8fbe79..2ae27b8409e 100644 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_add_24.xml +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_add_24.xml @@ -1,4 +1,4 @@ - + diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_add_photo_alternate_24.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_add_photo_alternate_24.xml index d710d27110a..7077fedd483 100644 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_add_photo_alternate_24.xml +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_add_photo_alternate_24.xml @@ -1,4 +1,4 @@ - + diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_article_24.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_article_24.xml index 30d5d26b985..a6837b9c69f 100644 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_article_24.xml +++ 
b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_article_24.xml @@ -1,4 +1,5 @@ - + diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_close_24.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_close_24.xml index f8ca0c64b98..fb902d4331b 100644 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_close_24.xml +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_close_24.xml @@ -1,4 +1,5 @@ - + diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_delete_forever_24.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_delete_forever_24.xml index 2c71fc6e568..4680bc6629e 100644 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_delete_forever_24.xml +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_delete_forever_24.xml @@ -1,4 +1,4 @@ - + diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_restart_alt_24.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_restart_alt_24.xml index 9285db079aa..860470ab109 100644 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_restart_alt_24.xml +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_restart_alt_24.xml @@ -1,4 +1,4 @@ - diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_send_24.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_send_24.xml index 3abc6cb33be..2de1f642089 100644 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_send_24.xml +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_send_24.xml @@ -1,5 +1,6 @@ diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_settings_24.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_settings_24.xml index 42593b298e9..c51d84b9f4f 100644 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_settings_24.xml +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_settings_24.xml @@ -3,7 +3,8 @@ android:height="24dp" android:viewportWidth="960" android:viewportHeight="960" - android:tint="#000000"> + android:tint="#FFFFFF +"> diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_stop_24.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_stop_24.xml index 817d57b76a8..832e2585954 100644 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_stop_24.xml +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_stop_24.xml @@ -1,4 +1,5 @@ - + diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/chat_background.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/chat_background.xml new file mode 100644 index 00000000000..eb8b9d1f1a9 --- /dev/null +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/chat_background.xml @@ -0,0 +1,21 @@ + + + + + + + + + + diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/expand_circle_down.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/expand_circle_down.xml new file mode 100644 index 00000000000..0a7a71f0700 --- /dev/null +++ 
b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/expand_circle_down.xml @@ -0,0 +1,9 @@ + + + diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/input_text_shape.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/input_text_shape.xml index 15c404c60df..35c778a437d 100644 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/input_text_shape.xml +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/input_text_shape.xml @@ -1,10 +1,7 @@ - - + - + \ No newline at end of file diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/outline_add_box_48.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/outline_add_box_48.xml index c8b2c96d585..bb45d63d85b 100644 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/outline_add_box_48.xml +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/outline_add_box_48.xml @@ -1,4 +1,5 @@ - + diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/outline_arrow_drop_down_circle_24.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/outline_arrow_drop_down_circle_24.xml deleted file mode 100644 index a8c859d8b36..00000000000 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/outline_arrow_drop_down_circle_24.xml +++ /dev/null @@ -1,5 +0,0 @@ - - - - - diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/prompt_shape.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/prompt_shape.xml index 1627ed98c0d..5f81396e382 100644 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/prompt_shape.xml +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/prompt_shape.xml @@ -1,6 +1,6 @@ - + \ No newline at end of file diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/received_message.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/received_message.xml index ea2d1bbfa14..c2288b5bfce 100644 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/received_message.xml +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/received_message.xml @@ -1,6 +1,6 @@ - + diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/layout/activity_main.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/layout/activity_main.xml index ec215e63ba1..7b8b8d1760d 100644 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/res/layout/activity_main.xml +++ b/examples/demo-apps/android/LlamaDemo/app/src/main/res/layout/activity_main.xml @@ -20,38 +20,32 @@ - - + android:textSize="14sp" /> + android:paddingTop="20dp" + android:src="@drawable/baseline_article_24" /> @@ -83,7 +77,7 @@ android:id="@+id/mediaPreviewConstraintLayout" android:layout_width="match_parent" android:layout_height="wrap_content" - android:background="#edf0ee" + android:background="#16293D" android:visibility="gone"> @@ -169,7 +163,7 @@ + android:text="" + android:textColor="#ffffff" + android:textColorHint="#ffffff" + android:translationY="5dp" /> + + android:textAlignment="viewStart" + android:textColor="#FFFFFF" + android:textSize="22sp" + android:translationX="5dp" + android:translationY="5dp" /> + android:translationX="5dp" /> + android:text="no model selected" + android:textColor="#FFFFFF" /> + android:background="#00FFFFFF" + android:scaleType="center" + android:scaleX="0.7" + android:scaleY="0.7" + android:src="@drawable/expand_circle_down" /> @@ 
-65,10 +74,12 @@ + android:translationX="5dp" /> + android:text="no tokenizer selected" + android:textColor="#FFFFFF" /> + android:background="#00FFFFFF" + android:scaleX="0.7" + android:scaleY="0.7" + android:src="@drawable/expand_circle_down" /> @@ -97,10 +111,12 @@ + android:translationX="5dp" /> + android:text="no model type selected" + android:textColor="#FFFFFF" /> + android:background="#00FFFFFF" + android:scaleX="0.7" + android:scaleY="0.7" + android:src="@drawable/expand_circle_down" /> +