From 35707c1ba243e1c5c353f78d679766350ac09645 Mon Sep 17 00:00:00 2001
From: Lunwen He
Date: Thu, 17 Oct 2024 15:17:11 -0700
Subject: [PATCH 1/4] Update

[ghstack-poisoned]
---
 .ci/scripts/test_llama_runner_eager.sh     | 62 ++++++++++++++++++++++
 .github/workflows/pull.yml                 | 27 ++++
 examples/models/llama/runner/eager.py      |  4 +-
 examples/models/llama/runner/generation.py |  8 +--
 4 files changed, 96 insertions(+), 5 deletions(-)
 create mode 100644 .ci/scripts/test_llama_runner_eager.sh

diff --git a/.ci/scripts/test_llama_runner_eager.sh b/.ci/scripts/test_llama_runner_eager.sh
new file mode 100644
index 00000000000..af109aee397
--- /dev/null
+++ b/.ci/scripts/test_llama_runner_eager.sh
@@ -0,0 +1,62 @@
+#!/bin/bash
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+set -exu
+
+if [[ -z "${PYTHON_EXECUTABLE:-}" ]]; then
+  PYTHON_EXECUTABLE=python3
+fi
+
+# Download and prepare stories model artifacts
+prepare_model_artifacts() {
+  echo "Preparing stories model artifacts"
+  wget -O stories110M.pt "https://huggingface.co/karpathy/tinyllamas/resolve/main/stories110M.pt"
+  wget -O tokenizer.model "https://raw.githubusercontent.com/karpathy/llama2.c/master/tokenizer.model"
+  echo '{"dim": 768, "multiple_of": 32, "n_heads": 12, "n_layers": 12, "norm_eps": 1e-05, "vocab_size": 32000}' > params.json
+}
+
+run_and_verify() {
+  NOW=$(date +"%H:%M:%S")
+  echo "Starting to run eval_llama at ${NOW}"
+  if [[ ! -f "stories110M.pt" ]]; then
+    echo "stories110M.pt is missing."
+    exit 1
+  fi
+  if [[ ! -f "tokenizer.model" ]]; then
+    echo "tokenizer.model is missing."
+    exit 1
+  fi
+  if [[ ! -f "params.json" ]]; then
+    echo "params.json is missing."
+    exit 1
+  fi
+  $PYTHON_EXECUTABLE -m examples.models.llama.runner.eager \
+    -c stories110M.pt \
+    -p params.json \
+    -t tokenizer.model \
+    -kv \
+    -d fp32 \
+    --max_seq_length 32 \
+    --temperature 0 \
+    --prompt "Once upon a time," > result.txt
+
+  # Verify result.txt
+  RESULT=$(cat result.txt)
+  EXPECTED_RESULT="there was a little girl"
+  if [[ "${RESULT}" == "${EXPECTED_TASK}"*]]; then
+    echo "Actual result: ${RESULT}"
+    echo "Success"
+    exit 0
+  else
+    echo "Actual result: ${RESULT}"
+    echo "Failure; results not the same"
+    exit 1
+  fi
+}
+
+prepare_model_artifacts
+run_and_verify
diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml
index 6ea94f3c5d2..d1ce4e6ac87 100644
--- a/.github/workflows/pull.yml
+++ b/.github/workflows/pull.yml
@@ -501,3 +501,30 @@ jobs:
 
         # run eval_llama mmlu task
         PYTHON_EXECUTABLE=python bash .ci/scripts/test_eval_llama_mmlu.sh
+
+  test-llama_runner_eager-linux:
+    name: test-llama_runner_eager-linux
+    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
+    strategy:
+      fail-fast: false
+    with:
+      runner: linux.24xlarge
+      docker-image: executorch-ubuntu-22.04-clang12
+      submodules: 'true'
+      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+      timeout: 90
+      script: |
+        # The generic Linux job chooses to use base env, not the one setup by the image
+        CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
+        conda activate "${CONDA_ENV}"
+
+        PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh "cmake"
+
+        # install pybind
+        bash install_requirements.sh --pybind xnnpack
+
+        # install llama requirements
+        bash examples/models/llama/install_requirements.sh
+
+        # run llama runner in eager mode
+        PYTHON_EXECUTABLE=python bash .ci/scripts/test_llama_runner_eager.sh
diff --git a/examples/models/llama/runner/eager.py b/examples/models/llama/runner/eager.py
index 42d11bdedfa..cff5c4f8023 100644
--- a/examples/models/llama/runner/eager.py
+++ b/examples/models/llama/runner/eager.py
@@ -11,11 +11,11 @@ import torch
 
 from examples.models.llama.llama_transformer import ModelArgs
 
-from executorch.examples.models.llama2.export_llama_lib import (
+from executorch.examples.models.llama.export_llama_lib import (
     _prepare_for_llama_export,
     build_args_parser as _build_args_parser,
 )
-from executorch.examples.models.llama2.runner.generation import LlamaRunner
+from executorch.examples.models.llama.runner.generation import LlamaRunner
 
 from executorch.extension.llm.export import LLMEdgeManager
 
diff --git a/examples/models/llama/runner/generation.py b/examples/models/llama/runner/generation.py
index 885249f9b9a..3f7937cd5a8 100644
--- a/examples/models/llama/runner/generation.py
+++ b/examples/models/llama/runner/generation.py
@@ -10,7 +10,7 @@ import torch
 
 from executorch.examples.models.llama.llama_transformer import ModelArgs
 
-from executorch.examples.models.llama.tokenizer.tiktoken import Tokenizer
+from executorch.extension.llm.tokenizer.utils import get_tokenizer
 
 
 class CompletionPrediction(TypedDict, total=False):
@@ -53,7 +53,7 @@ def next_token(logits: torch.Tensor, temperature: float, top_p: float) -> int:
 class LlamaRunner(ABC):
     def __init__(self, tokenizer_path: str, model_args: ModelArgs):
         self.params = model_args
-        self.tokenizer = Tokenizer(tokenizer_path)
+        self.tokenizer = get_tokenizer(tokenizer_path)
         assert model_args.vocab_size == self.tokenizer.n_words
 
     @abstractmethod
@@ -93,7 +93,9 @@ def generate( # noqa: C901
             else:
                 logits = self.forward(tokens=torch.tensor([tokens], dtype=torch.long))
             current_token = next_token(logits, temperature, top_p)
-            if current_token in self.tokenizer.stop_tokens:
+            if current_token == self.tokenizer.eos_id or (
+                hasattr(self, "stop_tokens") and current_token in self.stop_tokens
+            ):
                 break
             tokens.append(current_token)
 

From 8553e48dc65a0e99f1439b76dd13a8bc3954aa90 Mon Sep 17 00:00:00 2001
From: Lunwen He
Date: Fri, 18 Oct 2024 09:58:07 -0700
Subject: [PATCH 2/4] Update

[ghstack-poisoned]
---
 .ci/scripts/test_llama_runner_eager.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.ci/scripts/test_llama_runner_eager.sh b/.ci/scripts/test_llama_runner_eager.sh
index af109aee397..4decdeb5609 100644
--- a/.ci/scripts/test_llama_runner_eager.sh
+++ b/.ci/scripts/test_llama_runner_eager.sh
@@ -47,7 +47,7 @@ run_and_verify() {
   # Verify result.txt
   RESULT=$(cat result.txt)
   EXPECTED_RESULT="there was a little girl"
-  if [[ "${RESULT}" == "${EXPECTED_TASK}"*]]; then
+  if [[ "${RESULT}" == "${EXPECTED_TASK}"* ]]; then
     echo "Actual result: ${RESULT}"
     echo "Success"
     exit 0

From e126666a4134c7c87ecc17ccc70cbefda3b2db0b Mon Sep 17 00:00:00 2001
From: Lunwen He
Date: Fri, 18 Oct 2024 10:11:52 -0700
Subject: [PATCH 3/4] Update

[ghstack-poisoned]
---
 .ci/scripts/test_llama_runner_eager.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.ci/scripts/test_llama_runner_eager.sh b/.ci/scripts/test_llama_runner_eager.sh
index 4decdeb5609..ec2f30b7d27 100644
--- a/.ci/scripts/test_llama_runner_eager.sh
+++ b/.ci/scripts/test_llama_runner_eager.sh
@@ -47,7 +47,7 @@ run_and_verify() {
   # Verify result.txt
   RESULT=$(cat result.txt)
   EXPECTED_RESULT="there was a little girl"
-  if [[ "${RESULT}" == "${EXPECTED_TASK}"* ]]; then
+  if [[ "${RESULT}" == "${EXPECTED_RESULT}"* ]]; then
     echo "Actual result: ${RESULT}"
     echo "Success"
     exit 0

From e06f3329ac64e1d9c31dce07a334e4ba3ca37971 Mon Sep 17 00:00:00 2001
From: Lunwen He
Date: Fri, 18 Oct 2024 10:29:52 -0700
Subject: [PATCH 4/4] Update

[ghstack-poisoned]
---
 .ci/scripts/test_llama_runner_eager.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.ci/scripts/test_llama_runner_eager.sh b/.ci/scripts/test_llama_runner_eager.sh
index ec2f30b7d27..537d835ba1c 100644
--- a/.ci/scripts/test_llama_runner_eager.sh
+++ b/.ci/scripts/test_llama_runner_eager.sh
@@ -47,7 +47,7 @@ run_and_verify() {
   # Verify result.txt
   RESULT=$(cat result.txt)
   EXPECTED_RESULT="there was a little girl"
-  if [[ "${RESULT}" == "${EXPECTED_RESULT}"* ]]; then
+  if [[ "${RESULT}" == *"${EXPECTED_RESULT}"* ]]; then
     echo "Actual result: ${RESULT}"
     echo "Success"
     exit 0