59 changes: 59 additions & 0 deletions .ci/scripts/test_qnn_static_llama.sh
@@ -0,0 +1,59 @@
#!/bin/bash
# Copyright (c) Qualcomm Innovation Center, Inc.
# All rights reserved
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

set -exu

source "$(dirname "${BASH_SOURCE[0]}")/utils.sh"

export EXECUTORCH_ROOT="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")/.." && pwd)"
export QNN_SDK_ROOT=/tmp/qnn/2.28.0.241029
export LD_LIBRARY_PATH="${QNN_SDK_ROOT}/lib/x86_64-linux-clang"
export PYTHONPATH=".."
cp schema/program.fbs exir/_serialize/program.fbs
cp schema/scalar_type.fbs exir/_serialize/scalar_type.fbs
cp -f build-x86/backends/qualcomm/PyQnnManagerAdaptor.cpython-310-x86_64-linux-gnu.so backends/qualcomm/python
cp -f build-x86/backends/qualcomm/PyQnnWrapperAdaptor.cpython-310-x86_64-linux-gnu.so backends/qualcomm/python

if [[ -z "${PYTHON_EXECUTABLE:-}" ]]; then
  PYTHON_EXECUTABLE=python3
fi

which "${PYTHON_EXECUTABLE}"

# Although static llama CI does not require graphviz, it is required by test_qnn_delegate.py
pip install graphviz

# Download stories110M llama artifacts
download_stories_model_artifacts
echo "Creating tokenizer.bin"
$PYTHON_EXECUTABLE -m extension.llm.tokenizer.tokenizer -t tokenizer.model -o tokenizer.bin

set +e
# Compile only as weight sharing is not applicable on x86
$PYTHON_EXECUTABLE backends/qualcomm/tests/test_qnn_delegate.py -k TestExampleScript.test_stories_single_llama --model SM8650 --build_folder build-android/ --executorch_root . --artifact_dir . --compile_only
exit_code1=$?

# Checks accuracy with weight sharing disabled since x86 does not support weight sharing.
$PYTHON_EXECUTABLE backends/qualcomm/tests/test_qnn_delegate.py -k TestExampleScript.test_stories_single_llama --model SM8650 --build_folder build-x86/ --executorch_root . --artifact_dir . --enable_x86_64
exit_code2=$?

# Check the exit codes and print messages
if [ $exit_code1 -ne 0 ]; then
  echo "Static Llama compile-only (weight sharing) test failed with exit code $exit_code1."
fi

if [ $exit_code2 -ne 0 ]; then
  echo "Static Llama accuracy test failed with exit code $exit_code2."
fi

# Return failure if either program failed
if [ $exit_code1 -ne 0 ] || [ $exit_code2 -ne 0 ]; then
  exit 1
else
  exit 0
fi
set -e
33 changes: 33 additions & 0 deletions .github/workflows/pull.yml
@@ -437,6 +437,39 @@ jobs:
        # Test llama2
        PYTHON_EXECUTABLE=python bash .ci/scripts/test_llama.sh -model stories110M -build_tool "${BUILD_TOOL}" -mode "${MODE}" -dtype "${DTYPE}" -pt2e_quantize "${PT2E_QUANTIZE}"

  test-static-llama-qnn-linux:
    name: test-static-llama-qnn-linux
    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
    permissions:
      id-token: write
      contents: read
    strategy:
      fail-fast: false
    with:
      runner: linux.2xlarge
      docker-image: executorch-ubuntu-22.04-qnn-sdk
      submodules: 'true'
      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
      timeout: 180
      script: |
        # The generic Linux job chooses to use base env, not the one setup by the image
        CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
        conda activate "${CONDA_ENV}"

        BUILD_TOOL="cmake"

        PYTHON_EXECUTABLE=python bash .ci/scripts/setup-qnn-deps.sh
        PYTHON_EXECUTABLE=python bash .ci/scripts/build-qnn-sdk.sh

        # Setup executorch
        PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh "${BUILD_TOOL}"

        # Setup install_requirements for llama
        PYTHON_EXECUTABLE=python bash examples/models/llama/install_requirements.sh

        # Test static llama weight sharing and accuracy
        PYTHON_EXECUTABLE=python bash .ci/scripts/test_qnn_static_llama.sh

  test-qnn-models-linux:
    name: test-qnn-models-linux
    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
35 changes: 24 additions & 11 deletions backends/qualcomm/tests/test_qnn_delegate.py
@@ -1956,6 +1956,7 @@ def test_qnn_backend_multi_graphs(self):
                soc_model=self.chipset_table[TestQNN.model],
                backend_options=backend_options,
                multiple_graphs=True,
                weight_sharing=True,
                graph_name=graph_name,
            )
            for graph_name in graph_names
@@ -2519,6 +2520,7 @@ def test_qnn_backend_multi_graphs(self):
                soc_model=self.chipset_table[TestQNN.model],
                backend_options=backend_options,
                multiple_graphs=True,
                weight_sharing=True,
                graph_name=graph_name,
            )
            for graph_name in graph_names
@@ -3764,8 +3766,6 @@ def test_stories_single_llama(self):
            self.artifact_dir,
            "--build_folder",
            self.build_folder,
            "--device",
            self.device,
            "--model",
            self.model,
            "--checkpoint",
@@ -3788,9 +3788,21 @@
"0",
"--llama_model",
"stories110m",
]
"--model_mode",
"hybrid",
"--prefill_seq_len",
"32",
"--kv_seq_len",
"128",
]
if self.compile_only:
cmds.extend(["--compile_only"])
elif self.device:
cmds.extend(["--device", self.device])
if self.host:
cmds.extend(["--host", self.host])
elif self.enable_x86_64:
cmds.extend(["--enable_x86_64"])

        golden_start_with = "Once upon a time,"
        p = subprocess.Popen(cmds, stdout=subprocess.DEVNULL)
@@ -3801,8 +3813,13 @@
if "Error" in msg:
self.fail(msg["Error"])
else:
model_out = msg["result"][0]
self.assertTrue(model_out.startswith(golden_start_with))
if not self.compile_only:
model_out = msg["result"][0]
self.assertTrue(model_out.startswith(golden_start_with))
# x86 does not allow weight sharing, so we don't check pte size
if not self.enable_x86_64:
pte_size = msg["pte_size"]
self.assertLessEqual(pte_size, 130000000)

    @unittest.skip("dynamic shape inputs appear in recent torch.export.export")
    def test_mobilebert(self):
@@ -4007,12 +4024,6 @@ def setup_environment():
help="Path to open source software model repository",
type=str,
)
parser.add_argument(
"-x",
"--enable_x86_64",
help="Enable unittest to be executed on x86_64 platform",
action="store_true",
)

    args, ns_args = parser.parse_known_args(namespace=unittest)
    TestQNN.host = args.host
@@ -4031,6 +4042,8 @@ def setup_environment():
    TestQNN.shared_buffer = args.shared_buffer
    TestQNN.enable_x86_64 = args.enable_x86_64
    TestQNN.dump_intermediate_outputs = args.dump_intermediate_outputs
    TestQNN.compile_only = args.compile_only

    return sys.argv[:1] + ns_args


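Note on the assertions above: the test reads back a JSON message and only checks the keys `Error`, `result`, and `pte_size`. The reply below is a hypothetical illustration of that implied contract (key names are inferred from the checks in this diff, not copied from the example script; the values are made up):

```python
# Hypothetical reply the test expects from the llama example script; key names are
# inferred from msg["Error"], msg["result"][0] and msg["pte_size"] in the diff above.
import json

ok_reply = json.dumps(
    {
        "result": ["Once upon a time, there was a girl named Lily."],  # first entry must start with the golden prefix
        "pte_size": 125_000_000,  # bytes; asserted <= 130000000 when weight sharing is enabled
    }
)
error_reply = json.dumps({"Error": "artifact generation failed"})  # would trigger self.fail(...)
```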
1 change: 1 addition & 0 deletions backends/qualcomm/tests/utils.py
@@ -182,6 +182,7 @@ class TestQNN(unittest.TestCase):
    use_16a4w: str = "16a4w"
    shared_buffer: bool = False
    enable_x86_64: bool = False
    compile_only: bool = False

    def _assert_outputs_equal(self, model_output, ref_output):
        self.assertTrue(len(ref_output) == len(model_output))
13 changes: 12 additions & 1 deletion backends/qualcomm/utils/utils.py
Expand Up @@ -1163,6 +1163,7 @@ def generate_qnn_executorch_compiler_spec(
    shared_buffer: bool = False,
    is_from_context_binary: bool = False,
    multiple_graphs: bool = False,
    weight_sharing: bool = False,
    graph_name: str = "forward",
) -> List[CompileSpec]:
    """
@@ -1193,6 +1194,7 @@
        is_from_context_binary: True if the current graph comes from a pre-built context binary.
        multiple_graphs: True if multiple methods are expected to be included in a single .pte file.
            Please see test cases for a post-processing example.
        weight_sharing: Used together with multiple_graphs; reduces model size when operations
            share the same weights across multiple graphs.
        graph_name: Assign a unique graph name if 'multiple_graphs' is used.

    Returns:
@@ -1213,6 +1215,12 @@
            stacklevel=1,
        )

    if weight_sharing and not multiple_graphs:
        warnings.warn(
            "Weight sharing is intended for the multiple graphs scenario; please ensure multiple graphs are present",
            stacklevel=1,
        )

    qnn_executorch_options = QnnExecuTorchOptions(
        _soc_info_table[soc_model], backend_options
    )
@@ -1254,7 +1262,10 @@

    if multiple_graphs:
        # enable weight sharing mechanism if multiple graphs appear
        if backend_options.backend_type == QnnExecuTorchBackendType.kHtpBackend:
        if (
            backend_options.backend_type == QnnExecuTorchBackendType.kHtpBackend
            and weight_sharing
        ):
            backend_options.htp_options.use_weight_sharing = True

    return [
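As context for the new parameter (this sketch is not part of the diff), the intended usage is to create one compile spec per method, each with `multiple_graphs=True` and `weight_sharing=True`, so the HTP backend stores identical weights only once across the graphs. The graph names, chipset, and exact import paths below are illustrative assumptions:

```python
# Illustrative sketch only; import paths, graph names and the chipset are assumptions.
from executorch.backends.qualcomm.serialization.qnn_compile_spec_schema import (  # path may differ by version
    QcomChipset,
)
from executorch.backends.qualcomm.utils.utils import (
    generate_htp_compiler_spec,
    generate_qnn_executorch_compiler_spec,
)

graph_names = ["prefill_forward", "kv_forward"]  # assumed method names for a hybrid llama
backend_options = generate_htp_compiler_spec(use_fp16=False)

compiler_specs = [
    generate_qnn_executorch_compiler_spec(
        soc_model=QcomChipset.SM8650,   # assumed target SoC
        backend_options=backend_options,
        multiple_graphs=True,
        weight_sharing=True,            # only takes effect on the HTP backend (see check above)
        graph_name=name,
    )
    for name in graph_names
]
```

Per the change above, `weight_sharing` without `multiple_graphs` only emits the new warning and leaves the backend options untouched.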