From 4158d4c87c675267ec88453051ed034ce64405f0 Mon Sep 17 00:00:00 2001
From: Lunwen He
Date: Wed, 9 Oct 2024 10:11:56 -0700
Subject: [PATCH] Fix eval (#5955)

Summary:
This PR fixes several issues in the eval pipeline (a usage sketch follows the list):

- Use the right token, `eot_id`, for `eot_token_id`.
- Do not add `bos` and `eos` during `tok_encode`, per this [discussion](https://fburl.com/code/uifmt746).
- Update `executorch/examples/models/llama2/tokenizer/tiktoken.py` to be in sync with llama 3.1's official [code](https://github.com/meta-llama/llama-models/blob/main/models/llama3/api/tokenizer.py). The main change is the updated set of special tokens.
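For illustration, a minimal sketch of what the first two fixes mean from the harness side; the wrapper construction and names below are assumptions for the example, not code from this diff:

```python
# Hypothetical setup -- model/tokenizer construction is elided.
wrapper = EagerEvalWrapper(model=model, tokenizer=tokenizer)

# tok_encode no longer prepends BOS (and never appends EOS); the eval
# harness supplies its own prefix token, so returning the raw ids
# avoids a doubled BOS in every context.
ids = wrapper.tok_encode("The capital of France is")
assert tokenizer.bos_id not in ids

# eot_token_id now resolves to <|eot_id|> (end of turn), the token
# llama 3 instruct checkpoints emit to end a reply, rather than
# <|end_of_text|>.
assert wrapper.eot_token_id == tokenizer.eot_id
```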
""" special_tokens: Dict[str, int] @@ -49,6 +51,16 @@ class Tokenizer: pat_str = r"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+" # noqa: E501 + @classmethod + def get_instance(cls): + global _INSTANCE + + if _INSTANCE is None: + _INSTANCE = Tokenizer( + os.path.join(os.path.dirname(__file__), "tokenizer.model") + ) + return _INSTANCE + def __init__(self, model_path: str): """ Initializes the Tokenizer with a Tiktoken model. @@ -56,7 +68,6 @@ def __init__(self, model_path: str): Args: model_path (str): The path to the Tiktoken model file. """ - # reload tokenizer assert os.path.isfile(model_path), model_path mergeable_ranks = load_tiktoken_bpe(model_path) @@ -66,16 +77,21 @@ def __init__(self, model_path: str): "<|end_of_text|>", "<|reserved_special_token_0|>", "<|reserved_special_token_1|>", - "<|reserved_special_token_2|>", - "<|reserved_special_token_3|>", + "<|finetune_right_pad_id|>", + "<|step_id|>", "<|start_header_id|>", "<|end_header_id|>", - "<|reserved_special_token_4|>", + "<|eom_id|>", # end of message "<|eot_id|>", # end of turn - ] + [ - f"<|reserved_special_token_{i}|>" - for i in range(5, self.num_reserved_special_tokens - 5) + "<|python_tag|>", + "<|image|>", + ] + reserved_tokens = [ + f"<|reserved_special_token_{2 + i}|>" + for i in range(self.num_reserved_special_tokens - len(special_tokens)) ] + special_tokens = special_tokens + reserved_tokens + self.special_tokens = { token: num_base_tokens + i for i, token in enumerate(special_tokens) } @@ -85,20 +101,20 @@ def __init__(self, model_path: str): mergeable_ranks=mergeable_ranks, special_tokens=self.special_tokens, ) - logger.info(f"Reloaded SentencePiece model from {model_path}") + self.n_words: int = num_base_tokens + len(special_tokens) # BOS / EOS token IDs - self.n_words: int = self.model.n_vocab self.bos_id: int = self.special_tokens["<|begin_of_text|>"] self.eos_id: int = self.special_tokens["<|end_of_text|>"] - self.pad_id: int = -1 - self.stop_tokens = { - self.special_tokens["<|end_of_text|>"], + self.eot_id: int = self.special_tokens["<|eot_id|>"] + self.eom_id: int = self.special_tokens["<|eom_id|>"] + self.python_tag_id = self.special_tokens["<|python_tag|>"] + self.pad_id: int = self.special_tokens["<|finetune_right_pad_id|>"] + self.stop_tokens = [ + self.eos_id, + self.special_tokens["<|eom_id|>"], self.special_tokens["<|eot_id|>"], - } - logger.info( - f"#words: {self.n_words} - BOS ID: {self.bos_id} - EOS ID: {self.eos_id}" - ) + ] def encode( self, @@ -106,7 +122,7 @@ def encode( *, bos: bool, eos: bool, - allowed_special: Union[Literal["all"], AbstractSet[str]] = set(), # noqa B006 + allowed_special: Optional[Union[Literal["all"], AbstractSet[str]]] = None, disallowed_special: Union[Literal["all"], Collection[str]] = (), ) -> List[int]: """ @@ -125,22 +141,15 @@ def encode( By default, setting disallowed_special=() encodes a string by ignoring special tokens. Specifically: - Setting `disallowed_special` to () will cause all text corresponding - to special tokens to be encoded as natural text (instead of raising + to special tokens to be encoded as natural text (insteading of raising an error). - Setting `allowed_special` to "all" will treat all text corresponding to special tokens to be encoded as special tokens. 
""" + if allowed_special is None: + allowed_special = set() assert type(s) is str - # The tiktoken tokenizer can handle <=400k chars without - # pyo3_runtime.PanicException (may go beyond 400k) - TIKTOKEN_MAX_ENCODE_CHARS = 400_000 - - # https://github.com/openai/tiktoken/issues/195 - # Here we iterate over subsequences and split if we exceed the limit - # of max consecutive non-whitespace or whitespace characters. - MAX_NO_WHITESPACES_CHARS = 25_000 - substrs = ( substr for i in range(0, len(s), TIKTOKEN_MAX_ENCODE_CHARS) @@ -173,7 +182,7 @@ def decode(self, t: Sequence[int]) -> str: Returns: str: The decoded string. """ - # typecast is safe here, Tiktoken doesn't do anything list-related with the sequence. + # Typecast is safe here. Tiktoken doesn't do anything list-related with the sequence. return self.model.decode(cast(List[int], t)) @staticmethod @@ -181,8 +190,8 @@ def _split_whitespaces_or_nonwhitespaces( s: str, max_consecutive_slice_len: int ) -> Iterator[str]: """ - Split the string `s` so that each substring contains no more than `max_consecutive_slice_len` - consecutive whitespaces or consecutive non-whitespaces + Splits the string `s` so that each substring contains no more than `max_consecutive_slice_len` + consecutive whitespaces or consecutive non-whitespaces. """ current_slice_len = 0 current_slice_is_space = s[0].isspace() if len(s) > 0 else False @@ -201,33 +210,3 @@ def _split_whitespaces_or_nonwhitespaces( slice_start = i current_slice_len = 1 yield s[slice_start:] - - -class ChatFormat: - def __init__(self, tokenizer: Tokenizer): - self.tokenizer = tokenizer - - def encode_header(self, message: Message) -> List[int]: - tokens = [] - tokens.append(self.tokenizer.special_tokens["<|start_header_id|>"]) - tokens.extend(self.tokenizer.encode(message["role"], bos=False, eos=False)) - tokens.append(self.tokenizer.special_tokens["<|end_header_id|>"]) - tokens.extend(self.tokenizer.encode("\n\n", bos=False, eos=False)) - return tokens - - def encode_message(self, message: Message) -> List[int]: - tokens = self.encode_header(message) - tokens.extend( - self.tokenizer.encode(message["content"].strip(), bos=False, eos=False) - ) - tokens.append(self.tokenizer.special_tokens["<|eot_id|>"]) - return tokens - - def encode_dialog_prompt(self, dialog: Dialog) -> List[int]: - tokens = [] - tokens.append(self.tokenizer.special_tokens["<|begin_of_text|>"]) - for message in dialog: - tokens.extend(self.encode_message(message)) - # Add the start of an assistant message for the model to complete - tokens.extend(self.encode_header({"role": "assistant", "content": ""})) - return tokens