diff --git a/.ci/scripts/test_qnn_static_llama.sh b/.ci/scripts/test_qnn_static_llama.sh
new file mode 100644
index 00000000000..8aab21846f1
--- /dev/null
+++ b/.ci/scripts/test_qnn_static_llama.sh
@@ -0,0 +1,59 @@
+#!/bin/bash
+# Copyright (c) Qualcomm Innovation Center, Inc.
+# All rights reserved
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+set -exu
+
+source "$(dirname "${BASH_SOURCE[0]}")/utils.sh"
+
+export EXECUTORCH_ROOT="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")/.." && pwd)"
+export QNN_SDK_ROOT=/tmp/qnn/2.28.0.241029
+export LD_LIBRARY_PATH="${QNN_SDK_ROOT}/lib/x86_64-linux-clang"
+export PYTHONPATH=".."
+cp schema/program.fbs exir/_serialize/program.fbs
+cp schema/scalar_type.fbs exir/_serialize/scalar_type.fbs
+cp -f build-x86/backends/qualcomm/PyQnnManagerAdaptor.cpython-310-x86_64-linux-gnu.so backends/qualcomm/python
+cp -f build-x86/backends/qualcomm/PyQnnWrapperAdaptor.cpython-310-x86_64-linux-gnu.so backends/qualcomm/python
+
+if [[ -z "${PYTHON_EXECUTABLE:-}" ]]; then
+  PYTHON_EXECUTABLE=python3
+fi
+
+which "${PYTHON_EXECUTABLE}"
+
+# Although static llama CI does not require graphviz, it is required by test_qnn_delegate.py
+pip install graphviz
+
+# Download stories110M artifacts
+download_stories_model_artifacts
+echo "Creating tokenizer.bin"
+$PYTHON_EXECUTABLE -m extension.llm.tokenizer.tokenizer -t tokenizer.model -o tokenizer.bin
+
+set +e
+# Compile only, since weight sharing is not applicable on x86
+$PYTHON_EXECUTABLE backends/qualcomm/tests/test_qnn_delegate.py -k TestExampleScript.test_stories_single_llama --model SM8650 --build_folder build-android/ --executorch_root . --artifact_dir . --compile_only
+exit_code1=$?
+
+# Check accuracy with weight sharing disabled, since x86 does not support weight sharing
+$PYTHON_EXECUTABLE backends/qualcomm/tests/test_qnn_delegate.py -k TestExampleScript.test_stories_single_llama --model SM8650 --build_folder build-x86/ --executorch_root . --artifact_dir . --enable_x86_64
+exit_code2=$?
+
+# Check the exit codes and print messages
+if [ $exit_code1 -ne 0 ]; then
+  echo "Static Llama compile-only with weight sharing test failed. Exit code: $exit_code1."
+fi
+
+if [ $exit_code2 -ne 0 ]; then
+  echo "Static Llama accuracy test failed. Exit code: $exit_code2."
+fi
+
+# Return failure if either test failed
+if [ $exit_code1 -ne 0 ] || [ $exit_code2 -ne 0 ]; then
+  exit 1
+else
+  exit 0
+fi
+set -e
diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml
index 167e6bbd5a5..a2f65f1a7a9 100644
--- a/.github/workflows/pull.yml
+++ b/.github/workflows/pull.yml
@@ -437,6 +437,39 @@ jobs:
           # Test llama2
           PYTHON_EXECUTABLE=python bash .ci/scripts/test_llama.sh -model stories110M -build_tool "${BUILD_TOOL}" -mode "${MODE}" -dtype "${DTYPE}" -pt2e_quantize "${PT2E_QUANTIZE}"
 
+  test-static-llama-qnn-linux:
+    name: test-static-llama-qnn-linux
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    permissions:
+      id-token: write
+      contents: read
+    strategy:
+      fail-fast: false
+    with:
+      runner: linux.2xlarge
+      docker-image: executorch-ubuntu-22.04-qnn-sdk
+      submodules: 'true'
+      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+      timeout: 180
+      script: |
+        # The generic Linux job chooses to use base env, not the one set up by the image
+        CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
+        conda activate "${CONDA_ENV}"
+
+        BUILD_TOOL="cmake"
+
+        PYTHON_EXECUTABLE=python bash .ci/scripts/setup-qnn-deps.sh
+        PYTHON_EXECUTABLE=python bash .ci/scripts/build-qnn-sdk.sh
+
+        # Setup executorch
+        PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh "${BUILD_TOOL}"
+
+        # Setup install_requirements for llama
+        PYTHON_EXECUTABLE=python bash examples/models/llama/install_requirements.sh
+
+        # Test static llama weight sharing and accuracy
+        PYTHON_EXECUTABLE=python bash .ci/scripts/test_qnn_static_llama.sh
+
   test-qnn-models-linux:
     name: test-qnn-models-linux
     uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
diff --git a/backends/qualcomm/tests/test_qnn_delegate.py b/backends/qualcomm/tests/test_qnn_delegate.py
index b10f468a314..decb36c4426 100644
--- a/backends/qualcomm/tests/test_qnn_delegate.py
+++ b/backends/qualcomm/tests/test_qnn_delegate.py
@@ -1956,6 +1956,7 @@ def test_qnn_backend_multi_graphs(self):
                 soc_model=self.chipset_table[TestQNN.model],
                 backend_options=backend_options,
                 multiple_graphs=True,
+                weight_sharing=True,
                 graph_name=graph_name,
             )
             for graph_name in graph_names
@@ -2519,6 +2520,7 @@ def test_qnn_backend_multi_graphs(self):
                 soc_model=self.chipset_table[TestQNN.model],
                 backend_options=backend_options,
                 multiple_graphs=True,
+                weight_sharing=True,
                 graph_name=graph_name,
             )
             for graph_name in graph_names
@@ -3764,8 +3766,6 @@ def test_stories_single_llama(self):
             self.artifact_dir,
             "--build_folder",
             self.build_folder,
-            "--device",
-            self.device,
             "--model",
             self.model,
             "--checkpoint",
@@ -3788,9 +3788,21 @@ def test_stories_single_llama(self):
             "0",
             "--llama_model",
             "stories110m",
-        ]
+            "--model_mode",
+            "hybrid",
+            "--prefill_seq_len",
+            "32",
+            "--kv_seq_len",
+            "128",
+        ]
+        if self.compile_only:
+            cmds.extend(["--compile_only"])
+        elif self.device:
+            cmds.extend(["--device", self.device])
         if self.host:
             cmds.extend(["--host", self.host])
+        elif self.enable_x86_64:
+            cmds.extend(["--enable_x86_64"])
 
         golden_start_with = "Once upon a time,"
         p = subprocess.Popen(cmds, stdout=subprocess.DEVNULL)
@@ -3801,8 +3813,13 @@ def test_stories_single_llama(self):
         if "Error" in msg:
             self.fail(msg["Error"])
         else:
-            model_out = msg["result"][0]
-            self.assertTrue(model_out.startswith(golden_start_with))
+            if not self.compile_only:
+                model_out = msg["result"][0]
+                self.assertTrue(model_out.startswith(golden_start_with))
+            # x86 does not allow weight sharing, so we don't check pte size
+            if not self.enable_x86_64:
+                pte_size = msg["pte_size"]
+                self.assertLessEqual(pte_size, 130000000)
 
     @unittest.skip("dynamic shape inputs appear in recent torch.export.export")
     def test_mobilebert(self):
@@ -4007,12 +4024,6 @@ def setup_environment():
         help="Path to open source software model repository",
         type=str,
     )
-    parser.add_argument(
-        "-x",
-        "--enable_x86_64",
-        help="Enable unittest to be executed on x86_64 platform",
-        action="store_true",
-    )
 
     args, ns_args = parser.parse_known_args(namespace=unittest)
     TestQNN.host = args.host
@@ -4031,6 +4042,8 @@ def setup_environment():
     TestQNN.shared_buffer = args.shared_buffer
     TestQNN.enable_x86_64 = args.enable_x86_64
     TestQNN.dump_intermediate_outputs = args.dump_intermediate_outputs
+    TestQNN.compile_only = args.compile_only
+
     return sys.argv[:1] + ns_args
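For context on the msg payload checked above: when --ip/--port are set, llama.py connects back with multiprocessing.connection.Client and sends a JSON string carrying "result" and/or "pte_size". A minimal sketch of the receiving side; the address here is illustrative, the real harness wires this up elsewhere:

import json
from multiprocessing.connection import Listener

# Hypothetical receiver for the payload that llama.py sends via
# conn.send(json.dumps({...})); recv() returns the JSON string.
with Listener(("localhost", 8080)) as listener:  # address is illustrative
    with listener.accept() as conn:
        msg = json.loads(conn.recv())
        # "result" is absent in compile-only runs; pte_size is checked
        # only when weight sharing is enabled (i.e. not on x86_64).
        if "pte_size" in msg:
            assert msg["pte_size"] <= 130000000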
diff --git a/backends/qualcomm/tests/utils.py b/backends/qualcomm/tests/utils.py
index 249c6550bf8..3b5043ce9f9 100644
--- a/backends/qualcomm/tests/utils.py
+++ b/backends/qualcomm/tests/utils.py
@@ -182,6 +182,7 @@ class TestQNN(unittest.TestCase):
     use_16a4w: str = "16a4w"
     shared_buffer: bool = False
     enable_x86_64: bool = False
+    compile_only: bool = False
 
     def _assert_outputs_equal(self, model_output, ref_output):
         self.assertTrue(len(ref_output) == len(model_output))
diff --git a/backends/qualcomm/utils/utils.py b/backends/qualcomm/utils/utils.py
index 79f4e9b13b1..521e84dd39b 100644
--- a/backends/qualcomm/utils/utils.py
+++ b/backends/qualcomm/utils/utils.py
@@ -1163,6 +1163,7 @@ def generate_qnn_executorch_compiler_spec(
     shared_buffer: bool = False,
     is_from_context_binary: bool = False,
     multiple_graphs: bool = False,
+    weight_sharing: bool = False,
     graph_name: str = "forward",
 ) -> List[CompileSpec]:
     """
@@ -1193,6 +1194,7 @@ def generate_qnn_executorch_compiler_spec(
         is_from_context_binary: True if current graph comes from pre-built context binary.
         multiple_graphs: True if multiple methods are expected to have in single .pte file.
             Please see test cases for post-processing example.
+        weight_sharing: Used with multiple_graphs; reduces model size when operations share the same weights across multiple graphs.
         graph_name: Assign unique graph name if 'multiple_graphs' is used.
 
     Returns:
@@ -1213,6 +1215,12 @@ def generate_qnn_executorch_compiler_spec(
             stacklevel=1,
         )
 
+    if weight_sharing and not multiple_graphs:
+        warnings.warn(
+            "Weight sharing is intended for the multiple graphs scenario; please ensure there are multiple graphs",
+            stacklevel=1,
+        )
+
     qnn_executorch_options = QnnExecuTorchOptions(
         _soc_info_table[soc_model], backend_options
     )
@@ -1254,7 +1262,10 @@ def generate_qnn_executorch_compiler_spec(
 
     if multiple_graphs:
         # enable weight sharing mechanism if multiple graphs appear
-        if backend_options.backend_type == QnnExecuTorchBackendType.kHtpBackend:
+        if (
+            backend_options.backend_type == QnnExecuTorchBackendType.kHtpBackend
+            and weight_sharing
+        ):
            backend_options.htp_options.use_weight_sharing = True
 
     return [
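To illustrate the new flag: a sketch of how a caller might request weight sharing across graphs, mirroring the multi-graph tests earlier in this diff. The chipset and graph names are placeholders, and the QcomChipset import path may vary across executorch versions.

# Sketch only: mirrors the call pattern in the multi-graph tests above.
from executorch.backends.qualcomm.serialization.qc_schema import QcomChipset  # path may vary
from executorch.backends.qualcomm.utils.utils import (
    generate_htp_compiler_spec,
    generate_qnn_executorch_compiler_spec,
)

backend_options = generate_htp_compiler_spec(use_fp16=False)
graph_names = ["prefill_forward", "kv_forward"]  # placeholder method names
compiler_specs = [
    generate_qnn_executorch_compiler_spec(
        soc_model=QcomChipset.SM8650,  # placeholder target chipset
        backend_options=backend_options,
        multiple_graphs=True,
        weight_sharing=True,  # only takes effect on the HTP backend
        graph_name=graph_name,
    )
    for graph_name in graph_names
]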
diff --git a/examples/qualcomm/oss_scripts/llama/llama.py b/examples/qualcomm/oss_scripts/llama/llama.py
index f122ba38b5f..ff2c94db943 100755
--- a/examples/qualcomm/oss_scripts/llama/llama.py
+++ b/examples/qualcomm/oss_scripts/llama/llama.py
@@ -12,6 +12,7 @@
 import json
 import logging
 import os
+import subprocess
 import sys
 import time
 from collections import OrderedDict
@@ -654,6 +655,7 @@ def compile(args, pte_filename, tokenizer):
             backend_options=backend_options,
             shared_buffer=args.shared_buffer,
             multiple_graphs=True,
+            weight_sharing=not args.enable_x86_64,  # x86 emulator does not support weight sharing
             graph_name=graph_name,
         )
         for graph_name in graph_names
@@ -790,48 +792,11 @@ def inference(args, quant_attrs, pte_filename, runtime_tokenizer_path, pre_gen_p
     else:
         raise RuntimeError(f"Unknown model_mode: {args.model_mode}.")
 
-    seq_len = args.prefill_seq_len if args.model_mode == "prefill" else args.kv_seq_len
-    runner_args = " ".join(
-        [
-            f"--model_path {pte_filename}.pte",
-            "--output_path outputs/outputs.txt",
-            f"--tokenizer_path {os.path.basename(runtime_tokenizer_path)}",
-            f'--prompt "{args.prompt}"',
-            f"--seq_len {seq_len}",
-            f"--eval_mode {eval_mode}",
-            f"--temperature {args.temperature}",
-            f"--system_prompt '{args.system_prompt}'",
-            f"--logits_scale {quant_attrs['scale']}",
-            f"--logits_offset {quant_attrs['zero_point']}",
-            f"--kv_updator {'SmartMask' if args.kv_updator == smart_mask_updator else 'ShiftPointer'}",
-        ]
-    )
-    runner_cmd = " ".join(
-        [
-            f"cd {workspace} &&",
-            f"./qnn_llama_runner {runner_args}",
-        ]
-    )
-
     pte_path = (
         f"{pre_gen_pte}/{pte_filename}.pte"
         if pre_gen_pte
         else f"{args.artifact}/{pte_filename}.pte"
     )
-    adb = SimpleADB(
-        qnn_sdk=os.getenv("QNN_SDK_ROOT"),
-        build_path=f"{args.build_folder}",
-        pte_path=pte_path,
-        workspace=workspace,
-        device_id=args.device,
-        host_id=args.host,
-        soc_model=args.model,
-        shared_buffer=args.shared_buffer,
-        runner=f"examples/qualcomm/oss_scripts/llama/qnn_llama_runner",
-    )
-    # No pregen inputs, input_list is not required
-    adb.push(inputs=[], input_list="", files=[runtime_tokenizer_path])
-    adb.execute(custom_runner_cmd=runner_cmd)
 
     # collect output data
     output_data_folder = f"{args.artifact}/outputs"
@@ -842,14 +807,87 @@ def post_process():
         with open(f"{args.artifact}/outputs/outputs.txt", "r") as f:
             outputs.append(f.read())
 
-    adb.pull(output_path=args.artifact, callback=post_process)
+    seq_len = args.prefill_seq_len if args.model_mode == "prefill" else args.kv_seq_len
+    runner_args = " ".join(
+        [
+            f'--prompt "{args.prompt}"',
+            f"--eval_mode {eval_mode}",
+            f"--temperature {args.temperature}",
+            f"--system_prompt '{args.system_prompt}'",
+            f"--logits_scale {quant_attrs['scale']}",
+            f"--logits_offset {quant_attrs['zero_point']}",
+        ]
+    )
+
+    runner_cmd = ""
+    if args.enable_x86_64:
+        # x86 emulator is intended for CI, not performance; check only the first few tokens.
+        seq_len = min(seq_len, 16)
+        if args.kv_updator == smart_mask_updator:
+            logging.warning(
+                "x86 only supports ShiftPointer; overriding kv_updator to ShiftPointer"
+            )
+
+        qnn_sdk = os.getenv("QNN_SDK_ROOT")
+        target = "x86_64-linux-clang"
+        runner_cmd = " ".join(
+            [
+                f"export LD_LIBRARY_PATH={qnn_sdk}/lib/{target}/:{args.build_folder}/lib &&",
+                f"./{args.build_folder}/examples/qualcomm/oss_scripts/llama/qnn_llama_runner",
+                f"--tokenizer_path {runtime_tokenizer_path}",
+                f"--model_path {pte_path}",
+                f"--seq_len {seq_len}",
+                f"--output_path {args.artifact}/outputs/outputs.txt",
+                f"--kv_updator ShiftPointer",
+                runner_args,
+            ]
+        )
+        subprocess.run(
+            runner_cmd,
+            shell=True,
+            executable="/bin/bash",
+            capture_output=True,
+        )
+        post_process()
+    else:
+        runner_cmd = " ".join(
+            [
+                f"cd {workspace} &&",
+                f"./qnn_llama_runner",
+                f"--tokenizer_path {os.path.basename(runtime_tokenizer_path)}",
+                f"--model_path {pte_filename}.pte",
+                f"--seq_len {seq_len}",
+                "--output_path outputs/outputs.txt",
+                f"--kv_updator {'SmartMask' if args.kv_updator == smart_mask_updator else 'ShiftPointer'}",
+                runner_args,
+            ]
+        )
+
+        adb = SimpleADB(
+            qnn_sdk=os.getenv("QNN_SDK_ROOT"),
+            build_path=f"{args.build_folder}",
+            pte_path=pte_path,
+            workspace=workspace,
+            device_id=args.device,
+            host_id=args.host,
+            soc_model=args.model,
+            shared_buffer=args.shared_buffer,
+            runner=f"examples/qualcomm/oss_scripts/llama/qnn_llama_runner",
+        )
+        # No pregen inputs, input_list is not required
+        adb.push(inputs=[], input_list="", files=[runtime_tokenizer_path])
+        adb.execute(custom_runner_cmd=runner_cmd)
+
+        adb.pull(output_path=args.artifact, callback=post_process)
 
     if args.ip and args.port != -1:
+        pte_size = os.path.getsize(pte_path)
         with Client((args.ip, args.port)) as conn:
             conn.send(
                 json.dumps(
                     {
                         "result": outputs,
+                        "pte_size": pte_size,
                     }
                 )
             )
@@ -1062,6 +1100,18 @@ def main(args) -> None:
             )
         else:
             logging.warning("Quant attributes of the logit is None.")
+
+        if args.ip and args.port != -1:
+            pte_path = f"{args.artifact}/{pte_filename}.pte"
+            pte_size = os.path.getsize(pte_path)
+            with Client((args.ip, args.port)) as conn:
+                conn.send(
+                    json.dumps(
+                        {
+                            "pte_size": pte_size,
+                        }
+                    )
+                )
         exit(f"Finish compile_only and save to {args.artifact}")
 
     try:
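One caveat in the x86 branch above: subprocess.run(..., capture_output=True) discards the runner's output and exit status, so an emulator failure surfaces only as a missing or stale outputs.txt. A sketch of how the result could be checked instead, assuming the runner_cmd assembled above:

import logging
import subprocess

result = subprocess.run(
    runner_cmd,  # the x86 runner command string from the snippet above
    shell=True,
    executable="/bin/bash",
    capture_output=True,
)
if result.returncode != 0:
    # Surface the emulator's stderr rather than failing silently downstream.
    logging.error(result.stderr.decode())
    raise RuntimeError(f"qnn_llama_runner exited with code {result.returncode}")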
= "" + if args.enable_x86_64: + # x86 emulator is intended for CI and not performance. Check only the first few tokens. + seq_len = min(seq_len, 16) + if args.kv_updator == smart_mask_updator: + logging.warning( + "x86 only support ShiftPointer, overwrite kv_updator to ShiftPointer" + ) + + qnn_sdk = os.getenv("QNN_SDK_ROOT") + target = "x86_64-linux-clang" + runner_cmd = " ".join( + [ + f"export LD_LIBRARY_PATH={qnn_sdk}/lib/{target}/:{args.build_folder}/lib &&", + f"./{args.build_folder}/examples/qualcomm/oss_scripts/llama/qnn_llama_runner", + f"--tokenizer_path {runtime_tokenizer_path}", + f"--model_path {pte_path}", + f"--seq_len {seq_len}", + f"--output_path {args.artifact}/outputs/outputs.txt", + f"--kv_updator ShiftPointer", + runner_args, + ] + ) + subprocess.run( + runner_cmd, + shell=True, + executable="/bin/bash", + capture_output=True, + ) + post_process() + else: + runner_cmd = " ".join( + [ + f"cd {workspace} &&", + f"./qnn_llama_runner", + f"--tokenizer_path {os.path.basename(runtime_tokenizer_path)}", + f"--model_path {pte_filename}.pte", + f"--seq_len {seq_len}", + "--output_path outputs/outputs.txt", + f"--kv_updator {'SmartMask' if args.kv_updator == smart_mask_updator else 'ShiftPointer'}", + runner_args, + ] + ) + + adb = SimpleADB( + qnn_sdk=os.getenv("QNN_SDK_ROOT"), + build_path=f"{args.build_folder}", + pte_path=pte_path, + workspace=workspace, + device_id=args.device, + host_id=args.host, + soc_model=args.model, + shared_buffer=args.shared_buffer, + runner=f"examples/qualcomm/oss_scripts/llama/qnn_llama_runner", + ) + # No pregen inputs, input_list is not required + adb.push(inputs=[], input_list="", files=[runtime_tokenizer_path]) + adb.execute(custom_runner_cmd=runner_cmd) + + adb.pull(output_path=args.artifact, callback=post_process) if args.ip and args.port != -1: + pte_size = os.path.getsize(pte_path) with Client((args.ip, args.port)) as conn: conn.send( json.dumps( { "result": outputs, + "pte_size": pte_size, } ) ) @@ -1062,6 +1100,18 @@ def main(args) -> None: ) else: logging.warning("Quant attributes of the logit is None.") + + if args.ip and args.port != -1: + pte_path = f"{args.artifact}/{pte_filename}.pte" + pte_size = os.path.getsize(pte_path) + with Client((args.ip, args.port)) as conn: + conn.send( + json.dumps( + { + "pte_size": pte_size, + } + ) + ) exit(f"Finish compile_only and save to {args.artifact}") try: diff --git a/examples/qualcomm/utils.py b/examples/qualcomm/utils.py index 1ba15969e04..784197beb9d 100755 --- a/examples/qualcomm/utils.py +++ b/examples/qualcomm/utils.py @@ -524,6 +524,13 @@ def setup_common_args_and_variables(): default=False, ) + parser.add_argument( + "-x", + "--enable_x86_64", + help="Enable unittest to be executed on x86_64 platform", + action="store_true", + ) + # QNN_SDK_ROOT might also be an argument, but it is used in various places. # So maybe it's fine to just use the environment. if "QNN_SDK_ROOT" not in os.environ: