diff --git a/examples/models/llama2/eval_llama_lib.py b/examples/models/llama2/eval_llama_lib.py index 5e23a250026..d44820e1873 100644 --- a/examples/models/llama2/eval_llama_lib.py +++ b/examples/models/llama2/eval_llama_lib.py @@ -20,7 +20,7 @@ from executorch.extension.llm.tokenizer.tokenizer import ( Tokenizer as SentencePieceTokenizer, ) - +from executorch.extension.llm.tokenizer.utils import get_tokenizer from lm_eval.api.model import LM from .export_llama_lib import ( @@ -103,11 +103,7 @@ def gen_eval_wrapper( Returns: eval_wrapper (LM): A wrapper interface for the lm-evaluation-harness library. """ - try: - tokenizer = SentencePieceTokenizer(model_path=str(args.tokenizer_path)) - except Exception: - print("Using Tiktokenizer") - tokenizer = Tiktoken(model_path=str(args.tokenizer_path)) + tokenizer = get_tokenizer(args.tokenizer_path) # ExecuTorch Binary Evaluation if (model := args.pte) is not None: diff --git a/extension/llm/tokenizer/targets.bzl b/extension/llm/tokenizer/targets.bzl index 023968165bb..8229bced89e 100644 --- a/extension/llm/tokenizer/targets.bzl +++ b/extension/llm/tokenizer/targets.bzl @@ -25,6 +25,27 @@ def define_common_targets(): ], ) + runtime.python_library( + name = "utils", + srcs = [ + "utils.py", + ], + base_module = "executorch.extension.llm.tokenizer", + visibility = [ + "//executorch/examples/...", + "//executorch/extension/llm/tokenizer/...", + "//bento/...", + "//bento_kernels/...", + ], + deps = [ + "//executorch/examples/models/llama2/tokenizer:tiktoken", + ], + _is_external_target = True, + external_deps = [ + "sentencepiece-py", + ], + ) + runtime.python_binary( name = "tokenizer_py", main_module = "executorch.extension.llm.tokenizer.tokenizer", diff --git a/extension/llm/tokenizer/utils.py b/extension/llm/tokenizer/utils.py new file mode 100644 index 00000000000..97aa4bf0c02 --- /dev/null +++ b/extension/llm/tokenizer/utils.py @@ -0,0 +1,20 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. 
+# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# pyre-ignore: Undefined import [21]: Could not find a module corresponding to import `executorch.examples.models.llama2.tokenizer.tiktoken`. +from executorch.examples.models.llama2.tokenizer.tiktoken import Tokenizer as Tiktoken +from executorch.extension.llm.tokenizer.tokenizer import ( + Tokenizer as SentencePieceTokenizer, +) + + +def get_tokenizer(tokenizer_path): + try: + tokenizer = SentencePieceTokenizer(model_path=str(tokenizer_path)) + except Exception: + print("Using Tiktokenizer") + tokenizer = Tiktoken(model_path=str(tokenizer_path)) + return tokenizer