From 4158d4c87c675267ec88453051ed034ce64405f0 Mon Sep 17 00:00:00 2001
From: Lunwen He
Date: Wed, 9 Oct 2024 10:11:56 -0700
Subject: [PATCH] Fix eval (#5955)

Summary:
This PR fixes several issues in the eval pipeline (a usage sketch follows the list):

- Use the right token, `eot_id`, for `eot_token_id`.
- Do not add `bos` and `eos` during `tok_encode`, per this [discussion](https://fburl.com/code/uifmt746).
- Update `executorch/examples/models/llama2/tokenizer/tiktoken.py` to be in sync with llama 3.1's official [code](https://github.com/meta-llama/llama-models/blob/main/models/llama3/api/tokenizer.py). The main change is the updated set of special tokens.
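For illustration, a minimal sketch of what the first two fixes mean from the harness side; the wrapper construction and names below are assumptions for the example, not code from this diff:

```python
# Hypothetical setup -- model/tokenizer construction is elided.
wrapper = EagerEvalWrapper(model=model, tokenizer=tokenizer)

# tok_encode no longer prepends BOS (and never appends EOS); the eval
# harness supplies its own prefix token, so returning the raw ids
# avoids a doubled BOS in every context.
ids = wrapper.tok_encode("The capital of France is")
assert tokenizer.bos_id not in ids

# eot_token_id now resolves to <|eot_id|> (end of turn), the token
# llama 3 instruct checkpoints emit to end a reply, rather than
# <|end_of_text|>.
assert wrapper.eot_token_id == tokenizer.eot_id
```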
""" special_tokens: Dict[str, int] @@ -49,6 +51,16 @@ class Tokenizer: pat_str = r"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+" # noqa: E501 + @classmethod + def get_instance(cls): + global _INSTANCE + + if _INSTANCE is None: + _INSTANCE = Tokenizer( + os.path.join(os.path.dirname(__file__), "tokenizer.model") + ) + return _INSTANCE + def __init__(self, model_path: str): """ Initializes the Tokenizer with a Tiktoken model. @@ -56,7 +68,6 @@ def __init__(self, model_path: str): Args: model_path (str): The path to the Tiktoken model file. """ - # reload tokenizer assert os.path.isfile(model_path), model_path mergeable_ranks = load_tiktoken_bpe(model_path) @@ -66,16 +77,21 @@ def __init__(self, model_path: str): "<|end_of_text|>", "<|reserved_special_token_0|>", "<|reserved_special_token_1|>", - "<|reserved_special_token_2|>", - "<|reserved_special_token_3|>", + "<|finetune_right_pad_id|>", + "<|step_id|>", "<|start_header_id|>", "<|end_header_id|>", - "<|reserved_special_token_4|>", + "<|eom_id|>", # end of message "<|eot_id|>", # end of turn - ] + [ - f"<|reserved_special_token_{i}|>" - for i in range(5, self.num_reserved_special_tokens - 5) + "<|python_tag|>", + "<|image|>", + ] + reserved_tokens = [ + f"<|reserved_special_token_{2 + i}|>" + for i in range(self.num_reserved_special_tokens - len(special_tokens)) ] + special_tokens = special_tokens + reserved_tokens + self.special_tokens = { token: num_base_tokens + i for i, token in enumerate(special_tokens) } @@ -85,20 +101,20 @@ def __init__(self, model_path: str): mergeable_ranks=mergeable_ranks, special_tokens=self.special_tokens, ) - logger.info(f"Reloaded SentencePiece model from {model_path}") + self.n_words: int = num_base_tokens + len(special_tokens) # BOS / EOS token IDs - self.n_words: int = self.model.n_vocab self.bos_id: int = self.special_tokens["<|begin_of_text|>"] self.eos_id: int = self.special_tokens["<|end_of_text|>"] - self.pad_id: int = -1 - self.stop_tokens = { - self.special_tokens["<|end_of_text|>"], + self.eot_id: int = self.special_tokens["<|eot_id|>"] + self.eom_id: int = self.special_tokens["<|eom_id|>"] + self.python_tag_id = self.special_tokens["<|python_tag|>"] + self.pad_id: int = self.special_tokens["<|finetune_right_pad_id|>"] + self.stop_tokens = [ + self.eos_id, + self.special_tokens["<|eom_id|>"], self.special_tokens["<|eot_id|>"], - } - logger.info( - f"#words: {self.n_words} - BOS ID: {self.bos_id} - EOS ID: {self.eos_id}" - ) + ] def encode( self, @@ -106,7 +122,7 @@ def encode( *, bos: bool, eos: bool, - allowed_special: Union[Literal["all"], AbstractSet[str]] = set(), # noqa B006 + allowed_special: Optional[Union[Literal["all"], AbstractSet[str]]] = None, disallowed_special: Union[Literal["all"], Collection[str]] = (), ) -> List[int]: """ @@ -125,22 +141,15 @@ def encode( By default, setting disallowed_special=() encodes a string by ignoring special tokens. Specifically: - Setting `disallowed_special` to () will cause all text corresponding - to special tokens to be encoded as natural text (instead of raising + to special tokens to be encoded as natural text (insteading of raising an error). - Setting `allowed_special` to "all" will treat all text corresponding to special tokens to be encoded as special tokens. 
""" + if allowed_special is None: + allowed_special = set() assert type(s) is str - # The tiktoken tokenizer can handle <=400k chars without - # pyo3_runtime.PanicException (may go beyond 400k) - TIKTOKEN_MAX_ENCODE_CHARS = 400_000 - - # https://github.com/openai/tiktoken/issues/195 - # Here we iterate over subsequences and split if we exceed the limit - # of max consecutive non-whitespace or whitespace characters. - MAX_NO_WHITESPACES_CHARS = 25_000 - substrs = ( substr for i in range(0, len(s), TIKTOKEN_MAX_ENCODE_CHARS) @@ -173,7 +182,7 @@ def decode(self, t: Sequence[int]) -> str: Returns: str: The decoded string. """ - # typecast is safe here, Tiktoken doesn't do anything list-related with the sequence. + # Typecast is safe here. Tiktoken doesn't do anything list-related with the sequence. return self.model.decode(cast(List[int], t)) @staticmethod @@ -181,8 +190,8 @@ def _split_whitespaces_or_nonwhitespaces( s: str, max_consecutive_slice_len: int ) -> Iterator[str]: """ - Split the string `s` so that each substring contains no more than `max_consecutive_slice_len` - consecutive whitespaces or consecutive non-whitespaces + Splits the string `s` so that each substring contains no more than `max_consecutive_slice_len` + consecutive whitespaces or consecutive non-whitespaces. """ current_slice_len = 0 current_slice_is_space = s[0].isspace() if len(s) > 0 else False @@ -201,33 +210,3 @@ def _split_whitespaces_or_nonwhitespaces( slice_start = i current_slice_len = 1 yield s[slice_start:] - - -class ChatFormat: - def __init__(self, tokenizer: Tokenizer): - self.tokenizer = tokenizer - - def encode_header(self, message: Message) -> List[int]: - tokens = [] - tokens.append(self.tokenizer.special_tokens["<|start_header_id|>"]) - tokens.extend(self.tokenizer.encode(message["role"], bos=False, eos=False)) - tokens.append(self.tokenizer.special_tokens["<|end_header_id|>"]) - tokens.extend(self.tokenizer.encode("\n\n", bos=False, eos=False)) - return tokens - - def encode_message(self, message: Message) -> List[int]: - tokens = self.encode_header(message) - tokens.extend( - self.tokenizer.encode(message["content"].strip(), bos=False, eos=False) - ) - tokens.append(self.tokenizer.special_tokens["<|eot_id|>"]) - return tokens - - def encode_dialog_prompt(self, dialog: Dialog) -> List[int]: - tokens = [] - tokens.append(self.tokenizer.special_tokens["<|begin_of_text|>"]) - for message in dialog: - tokens.extend(self.encode_message(message)) - # Add the start of an assistant message for the model to complete - tokens.extend(self.encode_header({"role": "assistant", "content": ""})) - return tokens