172 changes: 0 additions & 172 deletions .ci/scripts/test_llama_lora.sh

This file was deleted.

124 changes: 124 additions & 0 deletions .ci/scripts/test_lora.sh
@@ -0,0 +1,124 @@
#!/bin/bash
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

set -exu
# shellcheck source=/dev/null
source "$(dirname "${BASH_SOURCE[0]}")/utils.sh"

cmake_install_executorch_libraries() {
echo "Installing libexecutorch.a, libextension_module.so, libportable_ops_lib.a"
Contributor comment: Remove this? Seems outdated.
rm -rf cmake-out
cmake --workflow llm-release
}

cmake_build_llama_runner() {
echo "Building llama runner"
pushd extension/llm/tokenizers
echo "Updating tokenizers submodule"
git submodule update --init
popd
make llama-cpu
}

cleanup_files() {
echo "Deleting downloaded and generated files"
rm -rf "${HF_QWEN_PATH}/"
rm -rf "${HF_ADAPTER_PATH}/"
rm -rf *.pte *.ptd
rm result*.txt
}

# Hosting lora adapter in personal repo for now.
python -m pip install -q huggingface_hub
HF_ADAPTER_REPO="lucylq/qwen3_06B_lora_math"
HF_ADAPTER_PATH=$(
bash "$(dirname "${BASH_SOURCE[0]}")/download_hf_hub.sh" \
--model_id "${HF_ADAPTER_REPO}" \
--files "adapter_config.json" "adapter_model.safetensors"
)

### SINGLE LORA PTE ###
# Export LoRA PTE file.
$PYTHON_EXECUTABLE -m extension.llm.export.export_llm \
--config examples/models/qwen3/config/qwen3_xnnpack.yaml \
+base.adapter_checkpoint="${HF_ADAPTER_PATH}/adapter_model.safetensors" \
+base.adapter_config="${HF_ADAPTER_PATH}/adapter_config.json" \
+export.output_name="qwen_lora_math_full.pte"

# Capture the path of the downloaded qwen artifacts
HF_QWEN_PATH=$(python -c "from huggingface_hub import snapshot_download; print(snapshot_download('unsloth/Qwen3-0.6B'))")
echo "Model downloaded to: $HF_QWEN_PATH"

### BUILD LLAMA RUNNER.
cmake_install_executorch_libraries
cmake_build_llama_runner

# Runner constants.
RUNTIME_ARGS="--tokenizer_path=${HF_QWEN_PATH}/ --temperature=0 --seq_len=100 --warmup=1"
PROMPT="<|im_start|>user Calculate 15% of 80?<|im_end|><|im_start|>assistant"
EXPECTED_PREFIX="
<|im_start|>user Calculate 15% of 80?<|im_end|><|im_start|>assistant
To calculate 15% of 80, we can multiply 80 by 0.15.
80 * 0.15 = 12
So, 15% of 80 is 12.
#### 12
The answer is: 12<|im_end|>"

# Run llama runner on single lora PTE file.
NOW=$(date +"%H:%M:%S")
echo "Starting to run llama runner at ${NOW}"
# shellcheck source=/dev/null
cmake-out/examples/models/llama/llama_main --model_path=qwen_lora_math_full.pte --prompt="${PROMPT}" ${RUNTIME_ARGS} > result.txt
NOW=$(date +"%H:%M:%S")
echo "Finished at ${NOW}"

RESULT=$(cat result.txt)
if [[ "${RESULT}" == "${EXPECTED_PREFIX}"* ]]; then
echo "Expected result prefix: ${EXPECTED_PREFIX}"
echo "Actual result: ${RESULT}"
# Do not clean up files if test passes, as they're re-used in the next test.
echo "Success"
else
echo "Expected result prefix: ${EXPECTED_PREFIX}"
echo "Actual result: ${RESULT}"
echo "Failure; results not the same"
cleanup_files
exit 1
fi

### PROGRAM DATA SEPARATION ###
# Export LoRA PTE, LoRA PTD, foundation PTD file.
$PYTHON_EXECUTABLE -m extension.llm.export.export_llm \
--config examples/models/qwen3/config/qwen3_xnnpack.yaml \
+base.adapter_checkpoint="${HF_ADAPTER_PATH}/adapter_model.safetensors" \
+base.adapter_config="${HF_ADAPTER_PATH}/adapter_config.json" \
+export.output_name="qwen_lora_math.pte" \
+export.foundation_weights_file="qwen_foundation.ptd" \
+export.lora_weights_file="qwen_lora_math.ptd"

# Run llama runner on PTE, PTD files.
NOW=$(date +"%H:%M:%S")
echo "Starting to run llama runner at ${NOW}"
# shellcheck source=/dev/null
cmake-out/examples/models/llama/llama_main --model_path=qwen_lora_math.pte --data_paths="qwen_foundation.ptd,qwen_lora_math.ptd" --prompt="${PROMPT}" ${RUNTIME_ARGS} > result2.txt
NOW=$(date +"%H:%M:%S")
echo "Finished at ${NOW}"

RESULT2=$(cat result2.txt)
if [[ "${RESULT2}" == "${EXPECTED_PREFIX}"* ]]; then
echo "Expected result prefix: ${EXPECTED_PREFIX}"
echo "Actual result: ${RESULT2}"
echo "Success"
else
echo "Expected result prefix: ${EXPECTED_PREFIX}"
echo "Actual result: ${RESULT2}"
echo "Failure; results not the same"
cleanup_files
exit 1
fi

cleanup_files
11 changes: 4 additions & 7 deletions .github/workflows/pull.yml
@@ -728,8 +728,8 @@ jobs:
# run llama runner in eager mode
PYTHON_EXECUTABLE=python bash .ci/scripts/test_llama_runner_eager.sh

test-llama-lora-linux:
name: test-llama-lora-linux
test-lora-linux:
name: test-lora-linux
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
permissions:
id-token: write
@@ -752,11 +752,8 @@ jobs:
# Install llama requirements
bash examples/models/llama/install_requirements.sh

# install a recent version of torchtune (>= 20250730)
PYTHON_EXECUTABLE=python python -m pip install torchtune==0.7.0.dev20250929 --extra-index-url https://download.pytorch.org/whl/nightly/cpu

# run llama runner in eager mode
PYTHON_EXECUTABLE=python bash .ci/scripts/test_llama_lora.sh
PYTHON_EXECUTABLE=python bash .ci/scripts/test_lora.sh

test-mediatek-models-linux:
name: test-mediatek-models-linux
@@ -863,7 +860,7 @@ jobs:
source .ci/scripts/setup-emscripten.sh

export PNPM_VERSION=10.24.0

curl -fsSL https://get.pnpm.io/install.sh | env PNPM_VERSION=$PNPM_VERSION SHELL="$(which bash)" sh -

export PNPM_HOME="$HOME/.local/share/pnpm"
3 changes: 2 additions & 1 deletion examples/models/qwen3/README.md
@@ -62,8 +62,9 @@ With ExecuTorch's sample c++ runner (see the Llama README's [Step 3: Run on your
cmake-out/examples/models/llama/llama_main \
--model_path qwen3_0_6b.pte \
--tokenizer_path ~/.cache/huggingface/hub/models--Qwen--Qwen3-0.6B/snapshots/a9c98e602b9d36d2a2f7ba1eb0f5f31e4e8e5143/tokenizer.json \
--prompt="Who is the president of the US?"
--prompt="<|im_start|>user Who is the president of the US?<|im_end|><|im_start|>assistant"
```
Note that you have to apply the chat template manually when using the C++ runner.
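To see the exact string the template produces, a minimal sketch (assuming the Hugging Face `transformers` package and the `Qwen/Qwen3-0.6B` tokenizer are available; neither is set up by this README) is:
```bash
# Hypothetical helper: print the chat-templated prompt so it can be pasted into
# llama_main's --prompt flag. Assumes `pip install transformers` has been run.
python -c "
from transformers import AutoTokenizer
tok = AutoTokenizer.from_pretrained('Qwen/Qwen3-0.6B')
messages = [{'role': 'user', 'content': 'Who is the president of the US?'}]
print(tok.apply_chat_template(messages, tokenize=False, add_generation_prompt=True))
"
```
The exact whitespace and markers may differ slightly from the prompt shown above, depending on the tokenizer's bundled chat template.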

To run the model on an example iOS or Android app, see the Llama README's [Step 5: Build Mobile apps](../llama/README.md#step-5-build-mobile-apps) section.

18 changes: 18 additions & 0 deletions examples/models/qwen3/config/qwen3_xnnpack.yaml
@@ -0,0 +1,18 @@
base:
model_class: "qwen3_0_6b"
params: "examples/models/qwen3/config/0_6b_config.json"
metadata: '{"get_bos_id": 151644, "get_eos_ids":[151645]}'

model:
use_kv_cache: True
use_sdpa_with_kv_cache: True
dtype_override: fp32

export:
max_seq_length: 2048
max_context_length: 2048

backend:
xnnpack:
enabled: True
extended_ops: True
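
For reference, a minimal sketch of how this config might be exercised on its own, without the LoRA overrides used in `.ci/scripts/test_lora.sh` (the output name is illustrative):
```bash
# Hypothetical standalone export using the config above; mirrors the export_llm
# invocation in .ci/scripts/test_lora.sh, minus the adapter overrides.
python -m extension.llm.export.export_llm \
  --config examples/models/qwen3/config/qwen3_xnnpack.yaml \
  +export.output_name="qwen3_0_6b.pte"
```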