From 918306d61ba458298069c9feb3faca19bd9221bb Mon Sep 17 00:00:00 2001
From: Lunwen He
Date: Fri, 25 Oct 2024 16:09:09 -0700
Subject: [PATCH] add the ability to run eager runner via buck (#6506)

Summary: This is helpful when we want to debug in eager mode using buck.

Differential Revision: D64730344
---
 examples/models/llama/TARGETS              |  4 ---
 examples/models/llama/eval_llama_lib.py    |  3 ++-
 examples/models/llama/runner/TARGETS       | 29 ++++++++++++++++++++++
 examples/models/llama/runner/eager.py      |  9 +++----
 examples/models/llama/runner/generation.py |  7 +++---
 examples/models/llama/runner/native.py     |  4 +--
 6 files changed, 41 insertions(+), 15 deletions(-)

diff --git a/examples/models/llama/TARGETS b/examples/models/llama/TARGETS
index 9bd16fa7c07..d328adffbf7 100644
--- a/examples/models/llama/TARGETS
+++ b/examples/models/llama/TARGETS
@@ -126,10 +126,6 @@ runtime.python_library(
 runtime.python_binary(
     name = "eval_llama",
     main_function = "executorch.examples.models.llama.eval_llama.main",
-    preload_deps = [
-        "//executorch/extension/llm/custom_ops:custom_ops_aot_lib",
-        "//executorch/kernels/quantized:aot_lib",
-    ],
     deps = [
         ":eval_library",
         "//caffe2:torch",
diff --git a/examples/models/llama/eval_llama_lib.py b/examples/models/llama/eval_llama_lib.py
index 285d2f874df..f0ef5d67589 100644
--- a/examples/models/llama/eval_llama_lib.py
+++ b/examples/models/llama/eval_llama_lib.py
@@ -293,6 +293,7 @@ def eval_llama(
 
     # Needed for loading mmlu dataset.
     # See https://github.com/EleutherAI/lm-evaluation-harness/pull/1998/files
+    # pyre-ignore: Undefined attribute [16]: `argparse.ArgumentParser` has no attribute `tasks`
     if args.tasks and "mmlu" in args.tasks:
         import datasets
 
@@ -302,7 +303,7 @@ def eval_llama(
     with torch.no_grad():
         eval_results = simple_evaluate(
             model=eval_wrapper,
-            tasks=args.tasks,  # pyre-ignore: Undefined attribute [16]: `argparse.ArgumentParser` has no attribute `tasks`
+            tasks=args.tasks,
             num_fewshot=args.num_fewshot,  # pyre-ignore: Undefined attribute [16]: `argparse.ArgumentParser` has no attribute `num_fewshot`
             limit=args.limit,  # pyre-ignore: Undefined attribute [16]: `argparse.ArgumentParser` has no attribute `limit`
         )
diff --git a/examples/models/llama/runner/TARGETS b/examples/models/llama/runner/TARGETS
index 2341af9282f..34cdd62be70 100644
--- a/examples/models/llama/runner/TARGETS
+++ b/examples/models/llama/runner/TARGETS
@@ -1,8 +1,37 @@
 # Any targets that should be shared between fbcode and xplat must be defined in
 # targets.bzl. This file can contain fbcode-only targets.
 
+load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime")
 load(":targets.bzl", "define_common_targets")
 
 oncall("executorch")
 
 define_common_targets()
+
+runtime.python_library(
+    name = "eager_runner_library",
+    srcs = [
+        "eager.py",
+        "generation.py",
+    ],
+    _is_external_target = True,
+    base_module = "executorch.examples.models.llama.runner",
+    visibility = [
+        "//bento/...",
+        "//bento_kernels/...",
+        "//executorch/examples/...",
+        "@EXECUTORCH_CLIENTS",
+    ],
+    deps = [
+        "//executorch/examples/models/llama:export_library",
+    ],
+)
+
+runtime.python_binary(
+    name = "eager",
+    main_function = "executorch.examples.models.llama.runner.eager.main",
+    deps = [
+        ":eager_runner_library",
+        "//caffe2:torch",
+    ],
+)
diff --git a/examples/models/llama/runner/eager.py b/examples/models/llama/runner/eager.py
index e116e08a099..b8792151a09 100644
--- a/examples/models/llama/runner/eager.py
+++ b/examples/models/llama/runner/eager.py
@@ -9,14 +9,13 @@
 from typing import Optional
 
 import torch
-
-from examples.models.llama.llama_transformer import ModelArgs
 from executorch.examples.models.llama.export_llama_lib import (
     _prepare_for_llama_export,
     build_args_parser as _build_args_parser,
 )
+from executorch.examples.models.llama.llama_transformer import ModelArgs
 from executorch.examples.models.llama.runner.generation import LlamaRunner
-from executorch.extension.llm.export import LLMEdgeManager
+from executorch.extension.llm.export.builder import LLMEdgeManager
 
 
 class EagerLlamaRunner(LlamaRunner):
@@ -43,8 +42,8 @@ def __init__(self, args):
 
     def forward(
         self,
-        tokens: Optional[torch.LongTensor] = None,
-        input_pos: Optional[torch.LongTensor] = None,
+        tokens: torch.Tensor,
+        input_pos: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
         return self.model.forward(tokens=tokens, input_pos=input_pos)
 
diff --git a/examples/models/llama/runner/generation.py b/examples/models/llama/runner/generation.py
index e332e0ebe2e..867c41aabea 100644
--- a/examples/models/llama/runner/generation.py
+++ b/examples/models/llama/runner/generation.py
@@ -15,7 +15,7 @@
 
 class CompletionPrediction(TypedDict, total=False):
     generation: str
-    tokens: List[str]  # not required
+    tokens: List[int]  # not required
 
 
 def sample_top_p(probs, p):
@@ -47,6 +47,7 @@ def next_token(logits: torch.Tensor, temperature: float, top_p: float) -> int:
     if temperature > 0:
         probs = torch.softmax(logits / temperature, dim=-1)
         return sample_top_p(probs, top_p).item()
+    # pyre-ignore[7]: Incompatible return type [7]: Expected `int` but got `Union[bool, float, int]`
     return torch.argmax(logits, dim=-1).item()
 
 
@@ -60,8 +61,8 @@ def __init__(self, tokenizer_path: str, model_args: ModelArgs, device: str = "cp
 
     @abstractmethod
     def forward(
         self,
-        tokens: Optional[torch.LongTensor] = None,
-        input_pos: Optional[torch.LongTensor] = None,
+        tokens: torch.Tensor,
+        input_pos: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
         pass
 
diff --git a/examples/models/llama/runner/native.py b/examples/models/llama/runner/native.py
index 90e7fc46dd0..73005d93330 100644
--- a/examples/models/llama/runner/native.py
+++ b/examples/models/llama/runner/native.py
@@ -42,8 +42,8 @@ def __init__(self, args):
 
     def forward(
         self,
-        tokens: Optional[torch.LongTensor] = None,
-        input_pos: Optional[torch.LongTensor] = None,
+        tokens: torch.Tensor,
+        input_pos: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
         return (
             self.model.forward((tokens, input_pos))