From bdc7cd387a11e44acfc1fc3cbb444d8b44da9ad9 Mon Sep 17 00:00:00 2001
From: Lunwen He
Date: Mon, 30 Sep 2024 12:22:06 -0700
Subject: [PATCH] buckify eval_llama (#5437)

Summary:
Pull Request resolved: https://github.com/pytorch/executorch/pull/5437

This PR buckifies `eval_llama`. This is useful when we need to run eval using buck (see the usage sketch after the diff).

Reviewed By: mergennachin

Differential Revision: D62897016
---
 examples/models/llama2/TARGETS                | 40 +++++++++++++++++++
 examples/models/llama2/eval_llama.py          |  2 +-
 examples/models/llama2/eval_llama_lib.py      | 31 +++++++-------
 examples/models/llama2/evaluate/eager_eval.py |  6 ++-
 extension/llm/export/builder.py               |  4 +-
 5 files changed, 64 insertions(+), 19 deletions(-)

diff --git a/examples/models/llama2/TARGETS b/examples/models/llama2/TARGETS
index eeeb8e5a04c..a2b7776cdf2 100644
--- a/examples/models/llama2/TARGETS
+++ b/examples/models/llama2/TARGETS
@@ -104,5 +104,45 @@ runtime.python_library(
         "//executorch/util:python_profiler",
         "fbsource//third-party/pypi/coremltools:coremltools",
         "fbsource//third-party/pypi/sentencepiece:sentencepiece",
+        "//pytorch/ao:torchao",
+    ],
+)
+
+runtime.python_binary(
+    name = "eval_llama",
+    main_function = "executorch.examples.models.llama2.eval_llama.main",
+    preload_deps = [
+        "//executorch/extension/llm/custom_ops:custom_ops_aot_lib",
+        "//executorch/kernels/quantized:aot_lib",
+    ],
+    deps = [
+        ":eval_library",
+        "//caffe2:torch",
+    ],
+)
+
+runtime.python_library(
+    name = "eval_library",
+    srcs = [
+        "eval_llama.py",
+        "eval_llama_lib.py",
+        "evaluate/eager_eval.py",
+    ],
+    _is_external_target = True,
+    base_module = "executorch.examples.models.llama2",
+    visibility = [
+        "//bento/...",
+        "//bento_kernels/...",
+        "//executorch/examples/...",
+        "@EXECUTORCH_CLIENTS",
+    ],
+    deps = [
+        "fbsource//third-party/pypi/lm-eval:lm-eval",
+        "fbsource//third-party/pypi/tiktoken:tiktoken",
+        ":export_library",
+        "//executorch/examples/models/llama2/tokenizer:tiktoken_py",
+        "//executorch/extension/llm/export:export_lib",
+        "//executorch/extension/llm/tokenizer:tokenizer_py_lib",
+        "//executorch/extension/pybindings:portable_lib",
     ],
 )
diff --git a/examples/models/llama2/eval_llama.py b/examples/models/llama2/eval_llama.py
index 4daeaf7afa5..09157789bde 100644
--- a/examples/models/llama2/eval_llama.py
+++ b/examples/models/llama2/eval_llama.py
@@ -24,7 +24,7 @@ def main() -> None:
     args = parser.parse_args()
     # Overrides this arg, because evaluation requires full logits.
     args.generate_full_logits = True
-    eval_llama(modelname, args)
+    eval_llama(modelname, args)  # pyre-ignore


 if __name__ == "__main__":
diff --git a/examples/models/llama2/eval_llama_lib.py b/examples/models/llama2/eval_llama_lib.py
index b8987ac5d49..3061d290bdc 100644
--- a/examples/models/llama2/eval_llama_lib.py
+++ b/examples/models/llama2/eval_llama_lib.py
@@ -10,19 +10,20 @@
 from typing import Optional, Union

 import torch
-from executorch.examples.models.llama2.evaluate import EagerEvalWrapper, evaluate_model
 from executorch.examples.models.llama2.export_llama_lib import (
     get_quantizer_and_quant_params,
 )
 from executorch.examples.models.llama2.tokenizer.tiktoken import Tokenizer as Tiktoken
-from executorch.extension.llm.export import LLMEdgeManager
+from executorch.extension.llm.export.builder import LLMEdgeManager
 from executorch.extension.llm.tokenizer.tokenizer import (
     Tokenizer as SentencePieceTokenizer,
 )
 from executorch.extension.llm.tokenizer.utils import get_tokenizer

 from lm_eval.api.model import LM

+from .evaluate.eager_eval import EagerEvalWrapper, evaluate_model
+
 from .export_llama_lib import (
     _prepare_for_llama_export,
     build_args_parser as _build_args_parser,
@@ -91,7 +92,7 @@ def __init__(
         tokenizer: Union[SentencePieceTokenizer, Tiktoken],
         max_seq_length: Optional[int] = None,
     ):
-        super().__init__(None, tokenizer, max_seq_length)
+        super().__init__(None, tokenizer, max_seq_length)  # pyre-ignore
         self._model = model  # Expects model to be path to a .pte file

         from executorch.extension.pybindings.portable_lib import _load_for_executorch
@@ -106,7 +107,7 @@ def __init__(
         from executorch.kernels import quantized  # noqa

         self._et_model = _load_for_executorch(self._model)
-        self._use_kv_cache = self._et_model.run_method("use_kv_cache")[0]
+        self._use_kv_cache = self._et_model.run_method("use_kv_cache")[0]  # pyre-ignore

     def _model_call(self, inps):
         # Given inps (tokens), return the logits from a single forward call
@@ -140,7 +141,7 @@ def __init__(
         tokenizer_bin: str,
         max_seq_length: Optional[int] = None,
     ):
-        super().__init__(None, tokenizer, max_seq_length)
+        super().__init__(None, tokenizer, max_seq_length)  # pyre-ignore
         self._model = model
         self._tokenizer_bin = tokenizer_bin

@@ -165,17 +166,17 @@ def gen_eval_wrapper(
     Returns:
         eval_wrapper (LM): A wrapper interface for the lm-evaluation-harness library.
""" - tokenizer = get_tokenizer(args.tokenizer_path) + tokenizer = get_tokenizer(args.tokenizer_path) # pyre-ignore # ExecuTorch Binary Evaluation - if (model := args.pte) is not None: - if (tokenizer_bin := args.tokenizer_bin) is not None: + if (model := args.pte) is not None: # pyre-ignore + if (tokenizer_bin := args.tokenizer_bin) is not None: # pyre-ignore # ETRunnerEvalWrapper: Create a wrapper around an ExecuTorch model, evaluated at runtime return ETRunnerEvalWrapper( model=model, tokenizer=tokenizer, tokenizer_bin=tokenizer_bin, - max_seq_length=args.max_seq_length, + max_seq_length=args.max_seq_length, # pyre-ignore ) # ETPybindEvalWrapper: Create a wrapper around an ExecuTorch model, evaluated with pybindings @@ -194,7 +195,7 @@ def gen_eval_wrapper( if len(quantizers) != 0: manager = manager.capture_pre_autograd_graph().pt2e_quantize(quantizers) model = ( - manager.pre_autograd_graph_module.to(device="cuda") + manager.pre_autograd_graph_module.to(device="cuda") # pyre-ignore if torch.cuda.is_available() else manager.pre_autograd_graph_module.to(device="cpu") ) @@ -202,8 +203,8 @@ def gen_eval_wrapper( model=model, tokenizer=tokenizer, max_seq_length=args.max_seq_length, - use_kv_cache=args.use_kv_cache, - enable_dynamic_shape=args.enable_dynamic_shape, + use_kv_cache=args.use_kv_cache, # pyre-ignore + enable_dynamic_shape=args.enable_dynamic_shape, # pyre-ignore ) else: # TODO: use manager.pre_autograd_graph_module for the eval to remove the if-else branch @@ -221,7 +222,7 @@ def gen_eval_wrapper( # that is not available in this eval_llama. We save the checkpoint # here for consistency with eval_llama. The accuracy results we # get from eval_llama can be used as a reference to other evaluations. - if args.output_eager_checkpoint_file is not None: + if args.output_eager_checkpoint_file is not None: # pyre-ignore torch.save(model, args.output_eager_checkpoint_file) return EagerEvalWrapper( @@ -282,8 +283,8 @@ def eval_llama( # Evaluate the model eval_results = evaluate_model( eval_wrapper, - args.tasks, - args.limit, + args.tasks, # pyre-ignore + args.limit, # pyre-ignore ) for task, res in eval_results["results"].items(): diff --git a/examples/models/llama2/evaluate/eager_eval.py b/examples/models/llama2/evaluate/eager_eval.py index e8a540f95e2..8f2659ab308 100644 --- a/examples/models/llama2/evaluate/eager_eval.py +++ b/examples/models/llama2/evaluate/eager_eval.py @@ -62,7 +62,7 @@ def batch_size(self): def device(self): return self._device - def tok_encode(self, string: str, **kwargs): + def tok_encode(self, string: str, **kwargs): # pyre-ignore tokens = self._tokenizer.encode(string, bos=True, eos=False) encoded = torch.tensor(tokens, dtype=torch.int, device=self.device) # encoded is a pytorch tensor, but some internal logic in the @@ -111,7 +111,9 @@ def evaluate_model( if "hendrycks_test" in tasks: tasks.remove("hendrycks_test") - tasks += list(lm_eval.tasks.hendrycks_test.create_all_tasks().keys()) + tasks += list( + lm_eval.tasks.hendrycks_test.create_all_tasks().keys() # pyre-ignore + ) task_dict = get_task_dict(tasks) eval_results = evaluate( diff --git a/extension/llm/export/builder.py b/extension/llm/export/builder.py index 91ee2dc733b..4329affbc03 100644 --- a/extension/llm/export/builder.py +++ b/extension/llm/export/builder.py @@ -209,7 +209,9 @@ def pt2e_calibrate( from executorch.examples.models.llama2.eval_llama_lib import ( GraphModuleEvalWrapper, ) - from executorch.examples.models.llama2.evaluate import evaluate_model + from 
+            from executorch.examples.models.llama2.evaluate import (  # pyre-ignore[21]
+                evaluate_model,
+            )
         except ImportError:
             raise ImportError(
                 "Please install the llm eval dependency via examples/models/llama2/install_requirements.sh"
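
For reference, the new `:eval_llama` buck binary simply points at `eval_llama.main`, which builds the eval arg parser and calls `eval_llama(...)` from `eval_llama_lib`. The sketch below mirrors that flow in plain Python so the wiring is visible outside of buck. It is illustrative only: the attribute names (`tokenizer_path`, `tasks`, `limit`, `generate_full_logits`) come from the diff above, but the placeholder paths, the task list, the limit value, and the `"llama2"` model name are assumptions, and a real run would also supply the checkpoint/params arguments defined by the export args parser.

# Illustrative sketch only -- mirrors what the buck-built eval_llama binary does.
# Paths, the task list, the limit, and the "llama2" model name are placeholders.
from executorch.examples.models.llama2.eval_llama_lib import (
    build_args_parser,
    eval_llama,
)


def run_eval_sketch() -> None:
    parser = build_args_parser()
    args = parser.parse_args([])  # start from parser defaults for this sketch
    args.tokenizer_path = "/path/to/tokenizer.model"  # placeholder path
    args.tasks = ["wikitext"]  # lm-eval task list; placeholder choice
    args.limit = 5  # evaluate only a few samples; placeholder value
    # Evaluation requires full logits, same as eval_llama.py's main().
    args.generate_full_logits = True
    eval_llama("llama2", args)  # "llama2" stands in for modelname in eval_llama.py


if __name__ == "__main__":
    run_eval_sketch()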