4 changes: 2 additions & 2 deletions .ci/scripts/test_qnn_static_llama.sh
@@ -34,11 +34,11 @@ $PYTHON_EXECUTABLE -m extension.llm.tokenizer.tokenizer -t tokenizer.model -o to

set +e
# Compile only as weight sharing is not applicable on x86
$PYTHON_EXECUTABLE backends/qualcomm/tests/test_qnn_delegate.py -k TestExampleScript.test_stories_single_llama --model SM8650 --build_folder build-android/ --executorch_root . --artifact_dir . --compile_only
$PYTHON_EXECUTABLE backends/qualcomm/tests/test_qnn_delegate.py -k TestExampleLLMScript.test_llama_stories_110m --model SM8650 --build_folder build-android/ --executorch_root . --artifact_dir . --llama_artifacts . --compile_only
exit_code1=$?

# Checks accuracy with weight sharing disabled since x86 does not support weight sharing.
$PYTHON_EXECUTABLE backends/qualcomm/tests/test_qnn_delegate.py -k TestExampleScript.test_stories_single_llama --model SM8650 --build_folder build-x86/ --executorch_root . --artifact_dir . --enable_x86_64
$PYTHON_EXECUTABLE backends/qualcomm/tests/test_qnn_delegate.py -k TestExampleLLMScript.test_llama_stories_110m --model SM8650 --build_folder build-x86/ --executorch_root . --artifact_dir . --llama_artifacts . --enable_x86_64
exit_code2=$?

# Check the exit codes and print messages
247 changes: 181 additions & 66 deletions backends/qualcomm/tests/test_qnn_delegate.py
@@ -2991,6 +2991,173 @@ def test_qnn_backend_draw_graph(self):
), "Generated .dot file does not match the golden file."


class TestExampleLLMScript(TestQNN):
def required_envs(self, conditions=None) -> bool:
conditions = [] if conditions is None else conditions
return all(
[
self.executorch_root,
self.artifact_dir,
*conditions,
]
)

def test_llama3_2_1b(self):
if not self.required_envs():
self.skipTest("missing required envs")
assert (
self.llama_artifacts is not None
), "Please provide path to llama artifacts"

prompt = "What is the meaning of life?"
cmds = [
"python",
f"{self.executorch_root}/examples/qualcomm/oss_scripts/llama/llama.py",
"--artifact",
self.artifact_dir,
"--build_folder",
self.build_folder,
"--model",
self.model,
"--checkpoint",
f"{self.llama_artifacts}/consolidated.00.pth",
"--params",
f"{self.llama_artifacts}/params.json",
"--tokenizer_model",
f"{self.llama_artifacts}/tokenizer.model",
"--ip",
self.ip,
"--port",
str(self.port),
"--prompt",
f"{prompt}",
"--ptq",
"16a4w",
"--temperature",
"0",
"--llama_model",
"llama3_2",
"--model_mode",
"hybrid",
"--prefill_seq_len",
"32",
"--kv_seq_len",
"512",
"--num_sharding",
"4",
]
if self.compile_only:
cmds.extend(["--compile_only"])
elif self.device:
cmds.extend(["--device", self.device])
if self.host:
cmds.extend(["--host", self.host])
elif self.enable_x86_64:
cmds.extend(["--enable_x86_64"])
if self.pre_gen_pte:
cmds.extend(["--pre_gen_pte", self.pre_gen_pte])

golden_start_with = "<|begin_of_text|><|start_header_id|>user<|end_header_id|>"
p = subprocess.Popen(cmds, stdout=subprocess.DEVNULL)
with Listener((self.ip, self.port)) as listener:
conn = listener.accept()
p.communicate()
msg = json.loads(conn.recv())
if "Error" in msg:
self.fail(msg["Error"])
else:
if not self.compile_only:
model_out = msg["result"][0]
self.assertTrue(
model_out.startswith(golden_start_with),
f"Expected Output: {golden_start_with}. Actual Output: {model_out}",
)
# x86 does not allow weight sharing, so we don't check pte size.
# Inference speed on x86 is slow, so we only check when running on Android
if not self.enable_x86_64:
pte_size = msg["pte_size"]
self.assertLessEqual(pte_size, 1300000000)
if not self.compile_only and not self.enable_x86_64:
self.assertGreaterEqual(msg["inference_speed"], 66) # Lanai

def test_llama_stories_110m(self):
if not self.required_envs():
self.skipTest("missing required envs")
assert (
self.llama_artifacts is not None
), "Please provide path to llama artifacts"

prompt = "Once"
cmds = [
"python",
f"{self.executorch_root}/examples/qualcomm/oss_scripts/llama/llama.py",
"--artifact",
self.artifact_dir,
"--build_folder",
self.build_folder,
"--model",
self.model,
"--checkpoint",
f"{self.llama_artifacts}/stories110M.pt",
"--params",
f"{self.llama_artifacts}/params.json",
"--tokenizer_model",
f"{self.llama_artifacts}/tokenizer.model",
"--tokenizer_bin",
f"{self.llama_artifacts}/tokenizer.bin",
"--ip",
self.ip,
"--port",
str(self.port),
"--prompt",
f"{prompt}",
"--ptq",
"16a4w",
"--temperature",
"0",
"--llama_model",
"stories110m",
"--model_mode",
"hybrid",
"--prefill_seq_len",
"32",
"--kv_seq_len",
"128",
]
if self.compile_only:
cmds.extend(["--compile_only"])
elif self.device:
cmds.extend(["--device", self.device])
if self.host:
cmds.extend(["--host", self.host])
elif self.enable_x86_64:
cmds.extend(["--enable_x86_64"])
if self.pre_gen_pte:
cmds.extend(["--pre_gen_pte", self.pre_gen_pte])

golden_start_with = "Once upon a time,"
p = subprocess.Popen(cmds, stdout=subprocess.DEVNULL)
with Listener((self.ip, self.port)) as listener:
conn = listener.accept()
p.communicate()
msg = json.loads(conn.recv())
if "Error" in msg:
self.fail(msg["Error"])
else:
if not self.compile_only:
model_out = msg["result"][0]
self.assertTrue(
model_out.startswith(golden_start_with),
f"Expected Output: {golden_start_with}. Actual Output: {model_out}",
)
# x86 does not allow weight sharing, so we don't check pte size
if not self.enable_x86_64:
pte_size = msg["pte_size"]
self.assertLessEqual(pte_size, 130000000)
if not self.compile_only and not self.enable_x86_64:
self.assertGreaterEqual(msg["inference_speed"], 220) # Lanai


class TestExampleOssScript(TestQNN):
def required_envs(self, conditions=None) -> bool:
conditions = [] if conditions is None else conditions
@@ -3886,72 +4053,6 @@ def test_deeplab_v3(self):
self.assertGreaterEqual(msg["MPA"], 0.70)
self.assertGreaterEqual(msg["MIoU"], 0.55)

def test_stories_single_llama(self):
if not self.required_envs():
self.skipTest("missing required envs")

cmds = [
"python",
f"{self.executorch_root}/examples/qualcomm/oss_scripts/llama/llama.py",
"--artifact",
self.artifact_dir,
"--build_folder",
self.build_folder,
"--model",
self.model,
"--checkpoint",
f"{self.artifact_dir}/stories110M.pt",
"--params",
f"{self.artifact_dir}/params.json",
"--tokenizer_model",
f"{self.artifact_dir}/tokenizer.model",
"--tokenizer_bin",
f"{self.artifact_dir}/tokenizer.bin",
"--ip",
self.ip,
"--port",
str(self.port),
"--prompt",
"Once",
"--ptq",
"16a4w",
"--temperature",
"0",
"--llama_model",
"stories110m",
"--model_mode",
"hybrid",
"--prefill_seq_len",
"32",
"--kv_seq_len",
"128",
]
if self.compile_only:
cmds.extend(["--compile_only"])
elif self.device:
cmds.extend(["--device", self.device])
if self.host:
cmds.extend(["--host", self.host])
elif self.enable_x86_64:
cmds.extend(["--enable_x86_64"])

golden_start_with = "Once upon a time,"
p = subprocess.Popen(cmds, stdout=subprocess.DEVNULL)
with Listener((self.ip, self.port)) as listener:
conn = listener.accept()
p.communicate()
msg = json.loads(conn.recv())
if "Error" in msg:
self.fail(msg["Error"])
else:
if not self.compile_only:
model_out = msg["result"][0]
self.assertTrue(model_out.startswith(golden_start_with))
# x86 does not allow weight sharing, so we don't check pte size
if not self.enable_x86_64:
pte_size = msg["pte_size"]
self.assertLessEqual(pte_size, 130000000)

@unittest.skip("dynamic shape inputs appear in recent torch.export.export")
def test_mobilebert(self):
if not self.required_envs([self.pretrained_weight]):
@@ -4156,6 +4257,18 @@ def setup_environment():
type=str,
)

parser.add_argument(
"--pre_gen_pte",
help="Run the pre-generated pte in the given directory.",
type=str,
)

parser.add_argument(
"--llama_artifacts",
help="A folder that contains: weight, tokenizer, and params.",
type=str,
)

args, ns_args = parser.parse_known_args(namespace=unittest)
TestQNN.host = args.host
TestQNN.device = args.device
@@ -4174,6 +4287,8 @@ def setup_environment():
TestQNN.enable_x86_64 = args.enable_x86_64
TestQNN.dump_intermediate_outputs = args.dump_intermediate_outputs
TestQNN.compile_only = args.compile_only
TestQNN.pre_gen_pte = args.pre_gen_pte
TestQNN.llama_artifacts = args.llama_artifacts

return sys.argv[:1] + ns_args

2 changes: 2 additions & 0 deletions backends/qualcomm/tests/utils.py
@@ -188,6 +188,8 @@ class TestQNN(unittest.TestCase):
shared_buffer: bool = False
enable_x86_64: bool = False
compile_only: bool = False
pre_gen_pte: str = ""
llama_artifacts: str = ""

def _assert_outputs_equal(self, model_output, ref_output):
self.assertTrue(len(ref_output) == len(model_output))
5 changes: 5 additions & 0 deletions examples/qualcomm/oss_scripts/llama/llama.py
@@ -881,13 +881,18 @@ def post_process():

adb.pull(output_path=args.artifact, callback=post_process)
if args.ip and args.port != -1:
inference_speed = 0
with open(f"{args.artifact}/outputs/inference_speed.txt", "r") as f:
inference_speed = float(f.read())

pte_size = os.path.getsize(pte_path)
with Client((args.ip, args.port)) as conn:
conn.send(
json.dumps(
{
"result": outputs,
"pte_size": pte_size,
"inference_speed": inference_speed,
}
)
)
14 changes: 14 additions & 0 deletions examples/qualcomm/oss_scripts/llama/runner/runner.cpp
@@ -18,6 +18,7 @@
#include <executorch/runtime/core/exec_aten/util/scalar_type_util.h>
#include <executorch/runtime/platform/log.h>
#include <ctime>
#include <fstream>
#include <sstream>

using executorch::aten::Tensor;
@@ -518,6 +519,19 @@ void printReport(const Runner::Stats& stats) {
stats.num_generated_tokens,
(double)stats.aggregate_sampling_time_ms /
stats.SCALING_FACTOR_UNITS_PER_SECOND);

// For now, we just print the total inference time for CI, can save more info
// in future if needed.
std::ofstream outfile("outputs/inference_speed.txt");
Contributor

It seems to default to writing to this path. If users don't have this path, I assume it will fail? Can we make this runner more generic so it can be reused directly?

Collaborator (Author)

Yes, I think running the executable directly without llama.py will fail. I will make a separate PR that makes this more flexible for users who are not using the Python script.
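
For instance, a minimal sketch of what that follow-up could look like (the helper and its parameter are illustrative only, not part of this PR):

#include <fstream>
#include <string>

// Only dump tokens-per-second when a path is provided, so the runner can be
// invoked directly (without llama.py) and without a pre-created output directory.
void maybe_dump_inference_speed(const std::string& path, double tokens_per_sec) {
  if (path.empty()) {
    return;  // no dump requested
  }
  std::ofstream outfile(path);
  if (outfile.is_open()) {
    outfile << tokens_per_sec;
  }
  // Intentionally non-fatal: a missing directory should not abort inference.
}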

if (outfile.is_open()) {
double num_tok = (stats.num_generated_tokens) /
(double)(stats.inference_end_ms - stats.inference_start_ms) *
stats.SCALING_FACTOR_UNITS_PER_SECOND;
outfile << num_tok;
outfile.close();
} else {
ET_CHECK_MSG(false, "Error saving the inference speed file");
}
}

std::string statsToJsonString(const Runner::Stats& stats) {
Expand Down