From 43cfbcb8a9183c701dadc63581f40e653d0fff28 Mon Sep 17 00:00:00 2001 From: winskuo-quic Date: Fri, 19 Sep 2025 16:53:37 +0800 Subject: [PATCH] Qualcomm AI Engine Direct - Support LLM Perplexity Evaluation on CI --- .ci/scripts/test_qnn_static_llama.sh | 69 ---------- .ci/scripts/test_qnn_static_llm.sh | 94 ++++++++++++++ .github/workflows/pull.yml | 13 +- backends/qualcomm/tests/test_qnn_delegate.py | 6 +- .../oss_scripts/llama/decoder_utils.py | 118 ++++++++++++------ examples/qualcomm/oss_scripts/llama/llama.py | 1 - 6 files changed, 183 insertions(+), 118 deletions(-) delete mode 100644 .ci/scripts/test_qnn_static_llama.sh create mode 100644 .ci/scripts/test_qnn_static_llm.sh diff --git a/.ci/scripts/test_qnn_static_llama.sh b/.ci/scripts/test_qnn_static_llama.sh deleted file mode 100644 index 7898d03b3b9..00000000000 --- a/.ci/scripts/test_qnn_static_llama.sh +++ /dev/null @@ -1,69 +0,0 @@ -#!/bin/bash -# Copyright (c) Qualcomm Innovation Center, Inc. -# All rights reserved -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -set -euxo pipefail - -source "$(dirname "${BASH_SOURCE[0]}")/utils.sh" - -# Download QNN_SDK. If already downloaded, export environment path -source "$(dirname "${BASH_SOURCE[0]}")/../../backends/qualcomm/scripts/install_qnn_sdk.sh" -install_qnn - -export EXECUTORCH_ROOT="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")/.." && pwd)" -export LD_LIBRARY_PATH="${QNN_SDK_ROOT}/lib/x86_64-linux-clang" -export PYTHONPATH=".." -cp schema/program.fbs exir/_serialize/program.fbs -cp schema/scalar_type.fbs exir/_serialize/scalar_type.fbs -cp -f build-x86/backends/qualcomm/PyQnnManagerAdaptor.cpython-310-x86_64-linux-gnu.so backends/qualcomm/python -cp -f build-x86/backends/qualcomm/PyQnnWrapperAdaptor.cpython-310-x86_64-linux-gnu.so backends/qualcomm/python - -if [[ -z "${PYTHON_EXECUTABLE:-}" ]]; then - PYTHON_EXECUTABLE=python3 -fi - -which "${PYTHON_EXECUTABLE}" - -# Although static llama CI does not require graphviz, it is required by test_qnn_delegate.py -pip install graphviz - -# Download stories llama110m artifacts -download_stories_model_artifacts -echo "Creating tokenizer.bin" -$PYTHON_EXECUTABLE -m pytorch_tokenizers.tools.llama2c.convert -t tokenizer.model -o tokenizer.bin - -set +e -# Compile only as weight sharing is not applicable on x86. -$PYTHON_EXECUTABLE backends/qualcomm/tests/test_qnn_delegate.py -k TestExampleLLMScript.test_llama_stories_110m --model SM8650 --build_folder build-android/ --executorch_root . --artifact_dir ./stories_110m_pte_size --llama_artifacts . --compile_only -exit_code1=$? - -# Checks accuracy with weight sharing disabled since x86 does not support weight sharing. -$PYTHON_EXECUTABLE backends/qualcomm/tests/test_qnn_delegate.py -k TestExampleLLMScript.test_llama_stories_110m --model SM8650 --build_folder build-x86/ --executorch_root . --artifact_dir ./stories_110m_accuracy --llama_artifacts . --enable_x86_64 -exit_code2=$? - -# Check BC -bash backends/qualcomm/bc/test_qnn_static_llama_bc.sh -exit_code3=$? - -# Check the exit codes and print messages -if [ $exit_code1 -ne 0 ]; then - echo "Static Llama compile only with weight sharing test failed. $exit_code1." -fi - -if [ $exit_code2 -ne 0 ]; then - echo "Static Llama accuracy test failed. $exit_code2." -fi - -if [ $exit_code3 -ne 0 ]; then - echo "Static Llama BACKWARD COMPATIBILITY test failed. $exit_code3." 
-fi
-
-# Return failure if either program failed
-if [ $exit_code1 -ne 0 ] || [ $exit_code2 -ne 0 ] || [ $exit_code3 -ne 0 ]; then
-  exit 1
-else
-  exit 0
-fi
diff --git a/.ci/scripts/test_qnn_static_llm.sh b/.ci/scripts/test_qnn_static_llm.sh
new file mode 100644
index 00000000000..9d1c82f12d5
--- /dev/null
+++ b/.ci/scripts/test_qnn_static_llm.sh
@@ -0,0 +1,94 @@
+#!/bin/bash
+# Copyright (c) Qualcomm Innovation Center, Inc.
+# All rights reserved
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+set -euxo pipefail
+
+source "$(dirname "${BASH_SOURCE[0]}")/utils.sh"
+
+TASK_NAME="${1:-}"
+if [[ -z "${TASK_NAME:-}" ]]; then
+  echo "Missing task name, exiting..."
+  exit 1
+fi
+
+
+# Download QNN_SDK. If already downloaded, export environment path
+source "$(dirname "${BASH_SOURCE[0]}")/../../backends/qualcomm/scripts/install_qnn_sdk.sh"
+install_qnn
+
+export EXECUTORCH_ROOT="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")/.." && pwd)"
+export LD_LIBRARY_PATH="${QNN_SDK_ROOT}/lib/x86_64-linux-clang"
+export PYTHONPATH=".."
+cp schema/program.fbs exir/_serialize/program.fbs
+cp schema/scalar_type.fbs exir/_serialize/scalar_type.fbs
+cp -f build-x86/backends/qualcomm/PyQnnManagerAdaptor.cpython-310-x86_64-linux-gnu.so backends/qualcomm/python
+cp -f build-x86/backends/qualcomm/PyQnnWrapperAdaptor.cpython-310-x86_64-linux-gnu.so backends/qualcomm/python
+
+if [[ -z "${PYTHON_EXECUTABLE:-}" ]]; then
+  PYTHON_EXECUTABLE=python3
+fi
+
+which "${PYTHON_EXECUTABLE}"
+
+# Although static llama CI does not require graphviz, it is required by test_qnn_delegate.py
+pip install graphviz
+
+set +e
+
+echo "Executing task: $TASK_NAME"
+if [[ "${TASK_NAME}" == "stories_110m" ]]; then
+    # Download stories llama110m artifacts
+    download_stories_model_artifacts
+    echo "Creating tokenizer.bin"
+    $PYTHON_EXECUTABLE -m pytorch_tokenizers.tools.llama2c.convert -t tokenizer.model -o tokenizer.bin
+
+    # Compile only as weight sharing is not applicable on x86.
+    $PYTHON_EXECUTABLE backends/qualcomm/tests/test_qnn_delegate.py -k TestExampleLLMScript.test_llama_stories_110m --model SM8650 --build_folder build-android/ --executorch_root . --artifact_dir ./stories_110m_pte_size --llama_artifacts . --compile_only
+    exit_code1=$?
+
+    # Checks accuracy with weight sharing disabled since x86 does not support weight sharing.
+    $PYTHON_EXECUTABLE backends/qualcomm/tests/test_qnn_delegate.py -k TestExampleLLMScript.test_llama_stories_110m --model SM8650 --build_folder build-x86/ --executorch_root . --artifact_dir ./stories_110m_accuracy --llama_artifacts . --enable_x86_64
+    exit_code2=$?
+
+    # Check the exit codes and print messages
+    if [ $exit_code1 -ne 0 ]; then
+        echo "Static Llama compile-only (weight sharing) test failed with exit code $exit_code1."
+    fi
+
+    if [ $exit_code2 -ne 0 ]; then
+        echo "Static Llama accuracy test failed with exit code $exit_code2."
+    fi
+
+    if [ $exit_code1 -ne 0 ] || [ $exit_code2 -ne 0 ]; then
+        exit 1
+    else
+        exit 0
+    fi
+
+elif [[ "${TASK_NAME}" == "stories_260k_bc" ]]; then
+
+    # Check BC
+    bash backends/qualcomm/bc/test_qnn_static_llama_bc.sh
+    exit_code1=$?
+    if [ $exit_code1 -ne 0 ]; then
+        exit 1
+    else
+        exit 0
+    fi
+
+elif [[ "${TASK_NAME}" == "smollm2_135m" ]]; then
+    $PYTHON_EXECUTABLE backends/qualcomm/tests/test_qnn_delegate.py -k TestExampleLLMScript.test_static_smollm2 --model SM8650 --build_folder build-x86/ --executorch_root . --artifact_dir ./static_smollm2 --enable_x86_64
+    exit_code1=$?
+ if [ $exit_code1 -ne 0 ]; then + exit 1 + else + exit 0 + fi +else + echo "Unsupported task: $TASK_NAME" + exit 1 +fi diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index 5b646cba9d1..8395352e1db 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -558,20 +558,22 @@ jobs: # Test llama2 PYTHON_EXECUTABLE=python bash .ci/scripts/test_llama.sh -model stories110M -build_tool "${BUILD_TOOL}" -mode "${MODE}" -dtype "${DTYPE}" -pt2e_quantize "${PT2E_QUANTIZE}" - test-static-llama-qnn-linux: - name: test-static-llama-qnn-linux + test-static-llm-qnn-linux: + name: test-static-llm-qnn-linux uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main permissions: id-token: write contents: read strategy: + matrix: + task: [stories_110m, stories_260k_bc, smollm2_135m] fail-fast: false with: - runner: linux.2xlarge + runner: linux.24xlarge docker-image: ci-image:executorch-ubuntu-22.04-qnn-sdk submodules: 'recursive' ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} - timeout: 180 + timeout: 900 script: | # The generic Linux job chooses to use base env, not the one setup by the image CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]") @@ -588,8 +590,7 @@ jobs: # Setup install_requirements for llama PYTHON_EXECUTABLE=python bash examples/models/llama/install_requirements.sh - # Test static llama weight sharing and accuracy - PYTHON_EXECUTABLE=python bash .ci/scripts/test_qnn_static_llama.sh + PYTHON_EXECUTABLE=python bash .ci/scripts/test_qnn_static_llm.sh ${{ matrix.task }} test-qnn-models-linux: name: test-qnn-models-linux diff --git a/backends/qualcomm/tests/test_qnn_delegate.py b/backends/qualcomm/tests/test_qnn_delegate.py index 2641acc5a2d..8eb8f382acd 100644 --- a/backends/qualcomm/tests/test_qnn_delegate.py +++ b/backends/qualcomm/tests/test_qnn_delegate.py @@ -6124,8 +6124,6 @@ def test_static_smollm2(self): "kv", "--temperature", "0", - "--prefill_ar_len", - "128", "--max_seq_len", "1024", "--eval_perplexity", @@ -6153,8 +6151,10 @@ def test_static_smollm2(self): if "Error" in msg: self.fail(msg["Error"]) else: + print("Perplexity score: ", msg["wiki_ppl"]) self.assertLessEqual(msg["wiki_ppl"], 25) - self.assertGreaterEqual(msg["inference_speed"], 200) + if not self.enable_x86_64: + self.assertGreaterEqual(msg["inference_speed"], 200) def test_static_smollm3(self): if not self.required_envs(): diff --git a/examples/qualcomm/oss_scripts/llama/decoder_utils.py b/examples/qualcomm/oss_scripts/llama/decoder_utils.py index 6a4d00a5308..bd72ad21ce4 100644 --- a/examples/qualcomm/oss_scripts/llama/decoder_utils.py +++ b/examples/qualcomm/oss_scripts/llama/decoder_utils.py @@ -7,11 +7,11 @@ import getpass import logging import os +import subprocess from collections import defaultdict, OrderedDict from typing import Callable, List, Optional, Tuple, Union import numpy as np - import torch from executorch.backends.qualcomm._passes import SeqMSE from executorch.examples.models.llama.evaluate.eager_eval import EagerEvalWrapper @@ -50,7 +50,7 @@ class GraphModuleCalibrationWrapper(EagerEvalWrapper): A wrapper class for calibration """ - def __init__( + def __init__( # noqa: C901 self, model: torch.fx.GraphModule, tokenizer: Union[ @@ -255,7 +255,7 @@ class QnnRunnerEvalWrapper(EagerEvalWrapper): A wrapper class to run PPL scores with QNN on device. 
""" - def __init__( + def __init__( # noqa: C901 self, args, pte_path: str, @@ -263,10 +263,16 @@ def __init__( SentencePieceTokenizer, TiktokenTokenizer, HuggingFaceTokenizer ], runtime_tokenizer_path, - max_seq_length: int, ): self.args = args self.pte_path = pte_path + self.enable_x86_64 = args.enable_x86_64 + self.max_seq_length = args.max_seq_len + + if self.enable_x86_64: + logging.warning( + "Using x86_64 emulator is NOT recommended as it is for CI purpose." + ) with open(pte_path, "rb") as f: program_data = f.read() @@ -304,16 +310,15 @@ def __init__( assert self.output_vocab_size is not None, "Couldn't find the vocab size" assert pte_max_seq_len is not None, "Couldn't find the max_seq_len from pte" - if pte_max_seq_len != max_seq_length: + if pte_max_seq_len != self.max_seq_length: logging.warning( - f"The pte provided has a max_seq_len {pte_max_seq_len}, which is different from --max_seq_len {max_seq_length} provided to the script, please ensure this is desired." + f"The pte provided has a max_seq_len {pte_max_seq_len}, which is different from --max_seq_len {self.max_seq_length} provided to the script, please ensure this is desired." ) - if pte_max_seq_len < max_seq_length: + if pte_max_seq_len < self.max_seq_length: logging.warning( - f"The pte max_seq_len {pte_max_seq_len} is used since it is shorter than --max_seq_len {max_seq_length}" + f"The pte max_seq_len {pte_max_seq_len} is used since it is shorter than --max_seq_len {self.max_seq_length}" ) - max_seq_length = pte_max_seq_len - self.max_seq_length = max_seq_length + self.max_seq_length = pte_max_seq_len self.runtime_tokenizer_path = runtime_tokenizer_path self.output_dir = args.artifact @@ -329,10 +334,16 @@ def __init__( soc_model=args.model, runner="examples/qualcomm/oss_scripts/llama/qnn_llama_runner", ) - self.adb.push(inputs=[], files=[self.runtime_tokenizer_path]) + + # collect output data + output_data_folder = f"{self.args.artifact}/outputs" + make_output_dir(output_data_folder) + + if not self.enable_x86_64: + self.adb.push(inputs=[], files=[self.runtime_tokenizer_path]) # n seq len = n-1 cache len, so we len(inps) = n-1 during _model_call # pyre-ignore - super().__init__(None, tokenizer, max_seq_length - 1) + super().__init__(None, tokenizer, self.max_seq_length - 1) def _model_call(self, inps): @@ -343,37 +354,13 @@ def _model_call(self, inps): outputs_path = "outputs/outputs.txt" dump_logits_path = "outputs/all_logit.raw" performance_output_path = "outputs/inference_speed.txt" - runner_cmd = " ".join( - [ - f"cd {self.workspace} &&", - "./qnn_llama_runner", - f"--decoder_model_version {DECODER_MODEL_VERSION[self.args.decoder_model]}", - f"--tokenizer_path {os.path.basename(self.runtime_tokenizer_path)}", - f"--model_path {os.path.basename(self.pte_path)}", - f"--seq_len {self.max_seq_length}", - f"--output_path {outputs_path}", - f"--performance_output_path {performance_output_path}", - f"--kv_updater {'SmartMask' if self.args.kv_updater == smart_mask_updater else 'ShiftPointer'}", - f"--window {self.args.window}", - f"--gcap {self.args.gcap}", - f"--ngram {self.args.ngram}", - f"--eval_mode {EVAL_MODE[self.args.model_mode]}", - "--temperature 0", - f"--dump_logits_path {dump_logits_path}", - f"--tokenized_prompt {os.path.basename(input_file_name)}", - ] - ) - - self.adb.push(inputs=[], files=[input_file_name], init_env=False) - self.adb.execute(custom_runner_cmd=runner_cmd) - output_data_folder = f"{self.output_dir}/outputs" - make_output_dir(output_data_folder) output_tensor_list = [] def post_process(): 
with open(f"{self.args.artifact}/{dump_logits_path}", "r") as f: + logits_dtype = np.float32 if self.kv_io_bit_width == 32 else np.uint16 output_tensor = torch.from_numpy( - np.fromfile(f.name, dtype=np.uint16).reshape( + np.fromfile(f.name, dtype=logits_dtype).reshape( 1, -1, self.output_vocab_size ) ) @@ -386,7 +373,60 @@ def post_process(): with open(f"{self.args.artifact}/{performance_output_path}", "r") as f: self.inference_speed = float(f.read()) - self.adb.pull(output_path=self.output_dir, callback=post_process) + if self.enable_x86_64: + qnn_sdk = os.getenv("QNN_SDK_ROOT") + target = "x86_64-linux-clang" + runner_cmd = " ".join( + [ + f"export LD_LIBRARY_PATH={qnn_sdk}/lib/{target}/:{self.args.build_folder}/lib &&", + f"./{self.args.build_folder}/examples/qualcomm/oss_scripts/llama/qnn_llama_runner", + f"--decoder_model_version {DECODER_MODEL_VERSION[self.args.decoder_model]}", + f"--tokenizer_path {self.runtime_tokenizer_path}", + f"--model_path {self.pte_path}", + f"--seq_len {self.max_seq_length}", + f"--output_path {self.args.artifact}/outputs/outputs.txt", + f"--performance_output_path {self.args.artifact}/{performance_output_path}", + f"--eval_mode {EVAL_MODE[self.args.model_mode]}", + "--temperature 0", + "--kv_updater ShiftPointer", + f"--dump_logits_path {self.args.artifact}/{dump_logits_path}", + f"--tokenized_prompt {input_file_name}", + ] + ) + subprocess.run( + runner_cmd, + shell=True, + executable="/bin/bash", + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + ) + post_process() + + else: + runner_cmd = " ".join( + [ + f"cd {self.workspace} &&", + "./qnn_llama_runner", + f"--decoder_model_version {DECODER_MODEL_VERSION[self.args.decoder_model]}", + f"--tokenizer_path {os.path.basename(self.runtime_tokenizer_path)}", + f"--model_path {os.path.basename(self.pte_path)}", + f"--seq_len {self.max_seq_length}", + f"--output_path {outputs_path}", + f"--performance_output_path {performance_output_path}", + f"--kv_updater {'SmartMask' if self.args.kv_updater == smart_mask_updater else 'ShiftPointer'}", + f"--window {self.args.window}", + f"--gcap {self.args.gcap}", + f"--ngram {self.args.ngram}", + f"--eval_mode {EVAL_MODE[self.args.model_mode]}", + "--temperature 0", + f"--dump_logits_path {dump_logits_path}", + f"--tokenized_prompt {os.path.basename(input_file_name)}", + ] + ) + + self.adb.push(inputs=[], files=[input_file_name], init_env=False) + self.adb.execute(custom_runner_cmd=runner_cmd) + self.adb.pull(output_path=self.output_dir, callback=post_process) return output_tensor_list[0] diff --git a/examples/qualcomm/oss_scripts/llama/llama.py b/examples/qualcomm/oss_scripts/llama/llama.py index 887e680341f..6f63b921ca3 100755 --- a/examples/qualcomm/oss_scripts/llama/llama.py +++ b/examples/qualcomm/oss_scripts/llama/llama.py @@ -892,7 +892,6 @@ def inference( pte_path=pte_path, tokenizer=tokenizer, runtime_tokenizer_path=runtime_tokenizer_path, - max_seq_length=args.max_seq_len, ) # Evaluate the model