From 43cfbcb8a9183c701dadc63581f40e653d0fff28 Mon Sep 17 00:00:00 2001 From: winskuo-quic Date: Fri, 19 Sep 2025 16:53:37 +0800 Subject: [PATCH] Qualcomm AI Engine Direct - Support LLM Perplexity Evaluation on CI --- .ci/scripts/test_qnn_static_llama.sh | 69 ---------- .ci/scripts/test_qnn_static_llm.sh | 94 ++++++++++++++ .github/workflows/pull.yml | 13 +- backends/qualcomm/tests/test_qnn_delegate.py | 6 +- .../oss_scripts/llama/decoder_utils.py | 118 ++++++++++++------ examples/qualcomm/oss_scripts/llama/llama.py | 1 - 6 files changed, 183 insertions(+), 118 deletions(-) delete mode 100644 .ci/scripts/test_qnn_static_llama.sh create mode 100644 .ci/scripts/test_qnn_static_llm.sh diff --git a/.ci/scripts/test_qnn_static_llama.sh b/.ci/scripts/test_qnn_static_llama.sh deleted file mode 100644 index 7898d03b3b9..00000000000 --- a/.ci/scripts/test_qnn_static_llama.sh +++ /dev/null @@ -1,69 +0,0 @@ -#!/bin/bash -# Copyright (c) Qualcomm Innovation Center, Inc. -# All rights reserved -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -set -euxo pipefail - -source "$(dirname "${BASH_SOURCE[0]}")/utils.sh" - -# Download QNN_SDK. If already downloaded, export environment path -source "$(dirname "${BASH_SOURCE[0]}")/../../backends/qualcomm/scripts/install_qnn_sdk.sh" -install_qnn - -export EXECUTORCH_ROOT="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")/.." && pwd)" -export LD_LIBRARY_PATH="${QNN_SDK_ROOT}/lib/x86_64-linux-clang" -export PYTHONPATH=".." -cp schema/program.fbs exir/_serialize/program.fbs -cp schema/scalar_type.fbs exir/_serialize/scalar_type.fbs -cp -f build-x86/backends/qualcomm/PyQnnManagerAdaptor.cpython-310-x86_64-linux-gnu.so backends/qualcomm/python -cp -f build-x86/backends/qualcomm/PyQnnWrapperAdaptor.cpython-310-x86_64-linux-gnu.so backends/qualcomm/python - -if [[ -z "${PYTHON_EXECUTABLE:-}" ]]; then - PYTHON_EXECUTABLE=python3 -fi - -which "${PYTHON_EXECUTABLE}" - -# Although static llama CI does not require graphviz, it is required by test_qnn_delegate.py -pip install graphviz - -# Download stories llama110m artifacts -download_stories_model_artifacts -echo "Creating tokenizer.bin" -$PYTHON_EXECUTABLE -m pytorch_tokenizers.tools.llama2c.convert -t tokenizer.model -o tokenizer.bin - -set +e -# Compile only as weight sharing is not applicable on x86. -$PYTHON_EXECUTABLE backends/qualcomm/tests/test_qnn_delegate.py -k TestExampleLLMScript.test_llama_stories_110m --model SM8650 --build_folder build-android/ --executorch_root . --artifact_dir ./stories_110m_pte_size --llama_artifacts . --compile_only -exit_code1=$? - -# Checks accuracy with weight sharing disabled since x86 does not support weight sharing. -$PYTHON_EXECUTABLE backends/qualcomm/tests/test_qnn_delegate.py -k TestExampleLLMScript.test_llama_stories_110m --model SM8650 --build_folder build-x86/ --executorch_root . --artifact_dir ./stories_110m_accuracy --llama_artifacts . --enable_x86_64 -exit_code2=$? - -# Check BC -bash backends/qualcomm/bc/test_qnn_static_llama_bc.sh -exit_code3=$? - -# Check the exit codes and print messages -if [ $exit_code1 -ne 0 ]; then - echo "Static Llama compile only with weight sharing test failed. $exit_code1." -fi - -if [ $exit_code2 -ne 0 ]; then - echo "Static Llama accuracy test failed. $exit_code2." -fi - -if [ $exit_code3 -ne 0 ]; then - echo "Static Llama BACKWARD COMPATIBILITY test failed. $exit_code3." 
-fi
-
-# Return failure if either program failed
-if [ $exit_code1 -ne 0 ] || [ $exit_code2 -ne 0 ] || [ $exit_code3 -ne 0 ]; then
-  exit 1
-else
-  exit 0
-fi
diff --git a/.ci/scripts/test_qnn_static_llm.sh b/.ci/scripts/test_qnn_static_llm.sh
new file mode 100644
index 00000000000..9d1c82f12d5
--- /dev/null
+++ b/.ci/scripts/test_qnn_static_llm.sh
@@ -0,0 +1,94 @@
+#!/bin/bash
+# Copyright (c) Qualcomm Innovation Center, Inc.
+# All rights reserved
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+set -euxo pipefail
+
+source "$(dirname "${BASH_SOURCE[0]}")/utils.sh"
+
+TASK_NAME="${1:-}"
+if [[ -z "${TASK_NAME:-}" ]]; then
+  echo "Missing task name, exiting..."
+  exit 1
+fi
+
+
+# Download QNN_SDK. If already downloaded, export environment path
+source "$(dirname "${BASH_SOURCE[0]}")/../../backends/qualcomm/scripts/install_qnn_sdk.sh"
+install_qnn
+
+export EXECUTORCH_ROOT="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")/.." && pwd)"
+export LD_LIBRARY_PATH="${QNN_SDK_ROOT}/lib/x86_64-linux-clang"
+export PYTHONPATH=".."
+cp schema/program.fbs exir/_serialize/program.fbs
+cp schema/scalar_type.fbs exir/_serialize/scalar_type.fbs
+cp -f build-x86/backends/qualcomm/PyQnnManagerAdaptor.cpython-310-x86_64-linux-gnu.so backends/qualcomm/python
+cp -f build-x86/backends/qualcomm/PyQnnWrapperAdaptor.cpython-310-x86_64-linux-gnu.so backends/qualcomm/python
+
+if [[ -z "${PYTHON_EXECUTABLE:-}" ]]; then
+  PYTHON_EXECUTABLE=python3
+fi
+
+which "${PYTHON_EXECUTABLE}"
+
+# Although static llama CI does not require graphviz, it is required by test_qnn_delegate.py
+pip install graphviz
+
+set +e
+
+echo "Executing task: $TASK_NAME"
+if [[ "${TASK_NAME}" == "stories_110m" ]]; then
+    # Download stories llama110m artifacts
+    download_stories_model_artifacts
+    echo "Creating tokenizer.bin"
+    $PYTHON_EXECUTABLE -m pytorch_tokenizers.tools.llama2c.convert -t tokenizer.model -o tokenizer.bin
+
+    # Compile only as weight sharing is not applicable on x86.
+    $PYTHON_EXECUTABLE backends/qualcomm/tests/test_qnn_delegate.py -k TestExampleLLMScript.test_llama_stories_110m --model SM8650 --build_folder build-android/ --executorch_root . --artifact_dir ./stories_110m_pte_size --llama_artifacts . --compile_only
+    exit_code1=$?
+
+    # Checks accuracy with weight sharing disabled since x86 does not support weight sharing.
+    $PYTHON_EXECUTABLE backends/qualcomm/tests/test_qnn_delegate.py -k TestExampleLLMScript.test_llama_stories_110m --model SM8650 --build_folder build-x86/ --executorch_root . --artifact_dir ./stories_110m_accuracy --llama_artifacts . --enable_x86_64
+    exit_code2=$?
+
+    # Check the exit codes and print messages
+    if [ $exit_code1 -ne 0 ]; then
+        echo "Static Llama compile-only (weight sharing) test failed with exit code $exit_code1."
+    fi
+
+    if [ $exit_code2 -ne 0 ]; then
+        echo "Static Llama accuracy test failed with exit code $exit_code2."
+    fi
+
+    if [ $exit_code1 -ne 0 ] || [ $exit_code2 -ne 0 ]; then
+        exit 1
+    else
+        exit 0
+    fi
+
+elif [[ "${TASK_NAME}" == "stories_260k_bc" ]]; then
+
+    # Check BC
+    bash backends/qualcomm/bc/test_qnn_static_llama_bc.sh
+    exit_code1=$?
+    if [ $exit_code1 -ne 0 ]; then
+        exit 1
+    else
+        exit 0
+    fi
+
+elif [[ "${TASK_NAME}" == "smollm2_135m" ]]; then
+    $PYTHON_EXECUTABLE backends/qualcomm/tests/test_qnn_delegate.py -k TestExampleLLMScript.test_static_smollm2 --model SM8650 --build_folder build-x86/ --executorch_root . --artifact_dir ./static_smollm2 --enable_x86_64
+    exit_code1=$?
+ if [ $exit_code1 -ne 0 ]; then + exit 1 + else + exit 0 + fi +else + echo "Unsupported task: $TASK_NAME" + exit 1 +fi diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index 5b646cba9d1..8395352e1db 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -558,20 +558,22 @@ jobs: # Test llama2 PYTHON_EXECUTABLE=python bash .ci/scripts/test_llama.sh -model stories110M -build_tool "${BUILD_TOOL}" -mode "${MODE}" -dtype "${DTYPE}" -pt2e_quantize "${PT2E_QUANTIZE}" - test-static-llama-qnn-linux: - name: test-static-llama-qnn-linux + test-static-llm-qnn-linux: + name: test-static-llm-qnn-linux uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main permissions: id-token: write contents: read strategy: + matrix: + task: [stories_110m, stories_260k_bc, smollm2_135m] fail-fast: false with: - runner: linux.2xlarge + runner: linux.24xlarge docker-image: ci-image:executorch-ubuntu-22.04-qnn-sdk submodules: 'recursive' ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} - timeout: 180 + timeout: 900 script: | # The generic Linux job chooses to use base env, not the one setup by the image CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]") @@ -588,8 +590,7 @@ jobs: # Setup install_requirements for llama PYTHON_EXECUTABLE=python bash examples/models/llama/install_requirements.sh - # Test static llama weight sharing and accuracy - PYTHON_EXECUTABLE=python bash .ci/scripts/test_qnn_static_llama.sh + PYTHON_EXECUTABLE=python bash .ci/scripts/test_qnn_static_llm.sh ${{ matrix.task }} test-qnn-models-linux: name: test-qnn-models-linux diff --git a/backends/qualcomm/tests/test_qnn_delegate.py b/backends/qualcomm/tests/test_qnn_delegate.py index 2641acc5a2d..8eb8f382acd 100644 --- a/backends/qualcomm/tests/test_qnn_delegate.py +++ b/backends/qualcomm/tests/test_qnn_delegate.py @@ -6124,8 +6124,6 @@ def test_static_smollm2(self): "kv", "--temperature", "0", - "--prefill_ar_len", - "128", "--max_seq_len", "1024", "--eval_perplexity", @@ -6153,8 +6151,10 @@ def test_static_smollm2(self): if "Error" in msg: self.fail(msg["Error"]) else: + print("Perplexity score: ", msg["wiki_ppl"]) self.assertLessEqual(msg["wiki_ppl"], 25) - self.assertGreaterEqual(msg["inference_speed"], 200) + if not self.enable_x86_64: + self.assertGreaterEqual(msg["inference_speed"], 200) def test_static_smollm3(self): if not self.required_envs(): diff --git a/examples/qualcomm/oss_scripts/llama/decoder_utils.py b/examples/qualcomm/oss_scripts/llama/decoder_utils.py index 6a4d00a5308..bd72ad21ce4 100644 --- a/examples/qualcomm/oss_scripts/llama/decoder_utils.py +++ b/examples/qualcomm/oss_scripts/llama/decoder_utils.py @@ -7,11 +7,11 @@ import getpass import logging import os +import subprocess from collections import defaultdict, OrderedDict from typing import Callable, List, Optional, Tuple, Union import numpy as np - import torch from executorch.backends.qualcomm._passes import SeqMSE from executorch.examples.models.llama.evaluate.eager_eval import EagerEvalWrapper @@ -50,7 +50,7 @@ class GraphModuleCalibrationWrapper(EagerEvalWrapper): A wrapper class for calibration """ - def __init__( + def __init__( # noqa: C901 self, model: torch.fx.GraphModule, tokenizer: Union[ @@ -255,7 +255,7 @@ class QnnRunnerEvalWrapper(EagerEvalWrapper): A wrapper class to run PPL scores with QNN on device. 
""" - def __init__( + def __init__( # noqa: C901 self, args, pte_path: str, @@ -263,10 +263,16 @@ def __init__( SentencePieceTokenizer, TiktokenTokenizer, HuggingFaceTokenizer ], runtime_tokenizer_path, - max_seq_length: int, ): self.args = args self.pte_path = pte_path + self.enable_x86_64 = args.enable_x86_64 + self.max_seq_length = args.max_seq_len + + if self.enable_x86_64: + logging.warning( + "Using x86_64 emulator is NOT recommended as it is for CI purpose." + ) with open(pte_path, "rb") as f: program_data = f.read() @@ -304,16 +310,15 @@ def __init__( assert self.output_vocab_size is not None, "Couldn't find the vocab size" assert pte_max_seq_len is not None, "Couldn't find the max_seq_len from pte" - if pte_max_seq_len != max_seq_length: + if pte_max_seq_len != self.max_seq_length: logging.warning( - f"The pte provided has a max_seq_len {pte_max_seq_len}, which is different from --max_seq_len {max_seq_length} provided to the script, please ensure this is desired." + f"The pte provided has a max_seq_len {pte_max_seq_len}, which is different from --max_seq_len {self.max_seq_length} provided to the script, please ensure this is desired." ) - if pte_max_seq_len < max_seq_length: + if pte_max_seq_len < self.max_seq_length: logging.warning( - f"The pte max_seq_len {pte_max_seq_len} is used since it is shorter than --max_seq_len {max_seq_length}" + f"The pte max_seq_len {pte_max_seq_len} is used since it is shorter than --max_seq_len {self.max_seq_length}" ) - max_seq_length = pte_max_seq_len - self.max_seq_length = max_seq_length + self.max_seq_length = pte_max_seq_len self.runtime_tokenizer_path = runtime_tokenizer_path self.output_dir = args.artifact @@ -329,10 +334,16 @@ def __init__( soc_model=args.model, runner="examples/qualcomm/oss_scripts/llama/qnn_llama_runner", ) - self.adb.push(inputs=[], files=[self.runtime_tokenizer_path]) + + # collect output data + output_data_folder = f"{self.args.artifact}/outputs" + make_output_dir(output_data_folder) + + if not self.enable_x86_64: + self.adb.push(inputs=[], files=[self.runtime_tokenizer_path]) # n seq len = n-1 cache len, so we len(inps) = n-1 during _model_call # pyre-ignore - super().__init__(None, tokenizer, max_seq_length - 1) + super().__init__(None, tokenizer, self.max_seq_length - 1) def _model_call(self, inps): @@ -343,37 +354,13 @@ def _model_call(self, inps): outputs_path = "outputs/outputs.txt" dump_logits_path = "outputs/all_logit.raw" performance_output_path = "outputs/inference_speed.txt" - runner_cmd = " ".join( - [ - f"cd {self.workspace} &&", - "./qnn_llama_runner", - f"--decoder_model_version {DECODER_MODEL_VERSION[self.args.decoder_model]}", - f"--tokenizer_path {os.path.basename(self.runtime_tokenizer_path)}", - f"--model_path {os.path.basename(self.pte_path)}", - f"--seq_len {self.max_seq_length}", - f"--output_path {outputs_path}", - f"--performance_output_path {performance_output_path}", - f"--kv_updater {'SmartMask' if self.args.kv_updater == smart_mask_updater else 'ShiftPointer'}", - f"--window {self.args.window}", - f"--gcap {self.args.gcap}", - f"--ngram {self.args.ngram}", - f"--eval_mode {EVAL_MODE[self.args.model_mode]}", - "--temperature 0", - f"--dump_logits_path {dump_logits_path}", - f"--tokenized_prompt {os.path.basename(input_file_name)}", - ] - ) - - self.adb.push(inputs=[], files=[input_file_name], init_env=False) - self.adb.execute(custom_runner_cmd=runner_cmd) - output_data_folder = f"{self.output_dir}/outputs" - make_output_dir(output_data_folder) output_tensor_list = [] def post_process(): 
with open(f"{self.args.artifact}/{dump_logits_path}", "r") as f: + logits_dtype = np.float32 if self.kv_io_bit_width == 32 else np.uint16 output_tensor = torch.from_numpy( - np.fromfile(f.name, dtype=np.uint16).reshape( + np.fromfile(f.name, dtype=logits_dtype).reshape( 1, -1, self.output_vocab_size ) ) @@ -386,7 +373,60 @@ def post_process(): with open(f"{self.args.artifact}/{performance_output_path}", "r") as f: self.inference_speed = float(f.read()) - self.adb.pull(output_path=self.output_dir, callback=post_process) + if self.enable_x86_64: + qnn_sdk = os.getenv("QNN_SDK_ROOT") + target = "x86_64-linux-clang" + runner_cmd = " ".join( + [ + f"export LD_LIBRARY_PATH={qnn_sdk}/lib/{target}/:{self.args.build_folder}/lib &&", + f"./{self.args.build_folder}/examples/qualcomm/oss_scripts/llama/qnn_llama_runner", + f"--decoder_model_version {DECODER_MODEL_VERSION[self.args.decoder_model]}", + f"--tokenizer_path {self.runtime_tokenizer_path}", + f"--model_path {self.pte_path}", + f"--seq_len {self.max_seq_length}", + f"--output_path {self.args.artifact}/outputs/outputs.txt", + f"--performance_output_path {self.args.artifact}/{performance_output_path}", + f"--eval_mode {EVAL_MODE[self.args.model_mode]}", + "--temperature 0", + "--kv_updater ShiftPointer", + f"--dump_logits_path {self.args.artifact}/{dump_logits_path}", + f"--tokenized_prompt {input_file_name}", + ] + ) + subprocess.run( + runner_cmd, + shell=True, + executable="/bin/bash", + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + ) + post_process() + + else: + runner_cmd = " ".join( + [ + f"cd {self.workspace} &&", + "./qnn_llama_runner", + f"--decoder_model_version {DECODER_MODEL_VERSION[self.args.decoder_model]}", + f"--tokenizer_path {os.path.basename(self.runtime_tokenizer_path)}", + f"--model_path {os.path.basename(self.pte_path)}", + f"--seq_len {self.max_seq_length}", + f"--output_path {outputs_path}", + f"--performance_output_path {performance_output_path}", + f"--kv_updater {'SmartMask' if self.args.kv_updater == smart_mask_updater else 'ShiftPointer'}", + f"--window {self.args.window}", + f"--gcap {self.args.gcap}", + f"--ngram {self.args.ngram}", + f"--eval_mode {EVAL_MODE[self.args.model_mode]}", + "--temperature 0", + f"--dump_logits_path {dump_logits_path}", + f"--tokenized_prompt {os.path.basename(input_file_name)}", + ] + ) + + self.adb.push(inputs=[], files=[input_file_name], init_env=False) + self.adb.execute(custom_runner_cmd=runner_cmd) + self.adb.pull(output_path=self.output_dir, callback=post_process) return output_tensor_list[0] diff --git a/examples/qualcomm/oss_scripts/llama/llama.py b/examples/qualcomm/oss_scripts/llama/llama.py index 887e680341f..6f63b921ca3 100755 --- a/examples/qualcomm/oss_scripts/llama/llama.py +++ b/examples/qualcomm/oss_scripts/llama/llama.py @@ -892,7 +892,6 @@ def inference( pte_path=pte_path, tokenizer=tokenizer, runtime_tokenizer_path=runtime_tokenizer_path, - max_seq_length=args.max_seq_len, ) # Evaluate the model