13 changes: 3 additions & 10 deletions examples/models/llama2/evaluate/eager_eval.py
@@ -44,7 +44,7 @@ def __init__(

@property
def eot_token_id(self):
return self._tokenizer.eos_id
return self._tokenizer.eot_id

@property
def max_length(self):
@@ -63,17 +63,10 @@ def device(self):
return self._device

def tok_encode(self, string: str, **kwargs): # pyre-ignore
tokens = self._tokenizer.encode(string, bos=True, eos=False)
encoded = torch.tensor(tokens, dtype=torch.int, device=self.device)
# encoded is a pytorch tensor, but some internal logic in the
# eval harness expects it to be a list instead
# TODO: verify this for multi-batch as well
encoded = encoded.tolist()
return encoded
return self._tokenizer.encode(string, bos=False, eos=False)

def tok_decode(self, tokens):
decoded = self._tokenizer.decode(tokens)
return decoded
return self._tokenizer.decode(tokens)

def _model_call(self, inps):
if self._use_kv_cache:
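
For context, a minimal sketch of the contract the change above restores: tok_encode now returns a plain Python list with no BOS token, which is what the eval harness expects, and tok_decode is a thin pass-through. The DummyTokenizer below is illustrative only and stands in for the real tiktoken-backed tokenizer:

# Illustrative sketch, not part of the diff. DummyTokenizer mimics the
# encode/decode API that the wrapper above delegates to.
class DummyTokenizer:
    def encode(self, s, bos=False, eos=False):
        ids = [ord(c) for c in s]  # toy encoding: one id per character
        return ([1] if bos else []) + ids + ([2] if eos else [])

    def decode(self, ids):
        return "".join(chr(i) for i in ids if i > 2)

tok = DummyTokenizer()
encoded = tok.encode("hi", bos=False, eos=False)
assert isinstance(encoded, list)  # the harness wants a list, not a torch.Tensor
assert tok.decode(encoded) == "hi"
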
117 changes: 48 additions & 69 deletions examples/models/llama2/tokenizer/tiktoken.py
@@ -15,32 +15,34 @@
Iterator,
List,
Literal,
Optional,
Sequence,
TypedDict,
Union,
)

import tiktoken
from tiktoken.load import load_tiktoken_bpe

from tiktoken.load import load_tiktoken_bpe

logger = getLogger(__name__)


Role = Literal["system", "user", "assistant"]

# The tiktoken tokenizer can handle <=400k chars without
# pyo3_runtime.PanicException.
TIKTOKEN_MAX_ENCODE_CHARS = 400_000

class Message(TypedDict):
role: Role
content: str
# https://github.com/openai/tiktoken/issues/195
# Here we iterate over subsequences and split if we exceed the limit
# of max consecutive non-whitespace or whitespace characters.
MAX_NO_WHITESPACES_CHARS = 25_000


Dialog = Sequence[Message]
_INSTANCE = None


class Tokenizer:
"""
tokenizing and encoding/decoding text using the Tiktoken tokenizer.
Tokenizing and encoding/decoding text using the Tiktoken tokenizer.
"""

special_tokens: Dict[str, int]
@@ -49,14 +51,23 @@ class Tokenizer:

pat_str = r"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+" # noqa: E501

@classmethod
def get_instance(cls):
global _INSTANCE

if _INSTANCE is None:
_INSTANCE = Tokenizer(
os.path.join(os.path.dirname(__file__), "tokenizer.model")
)
return _INSTANCE
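
A brief usage note on the new classmethod (illustrative): get_instance lazily loads tokenizer.model from the module directory once, then hands back the cached singleton, so repeated callers share one Tokenizer instead of re-reading the BPE file:

tok_a = Tokenizer.get_instance()  # first call loads tokenizer.model
tok_b = Tokenizer.get_instance()  # later calls reuse the cached instance
assert tok_a is tok_b
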

def __init__(self, model_path: str):
"""
Initializes the Tokenizer with a Tiktoken model.

Args:
model_path (str): The path to the Tiktoken model file.
"""
# reload tokenizer
assert os.path.isfile(model_path), model_path

mergeable_ranks = load_tiktoken_bpe(model_path)
@@ -66,16 +77,21 @@ def __init__(self, model_path: str):
"<|end_of_text|>",
"<|reserved_special_token_0|>",
"<|reserved_special_token_1|>",
"<|reserved_special_token_2|>",
"<|reserved_special_token_3|>",
"<|finetune_right_pad_id|>",
"<|step_id|>",
"<|start_header_id|>",
"<|end_header_id|>",
"<|reserved_special_token_4|>",
"<|eom_id|>", # end of message
"<|eot_id|>", # end of turn
] + [
f"<|reserved_special_token_{i}|>"
for i in range(5, self.num_reserved_special_tokens - 5)
"<|python_tag|>",
"<|image|>",
]
reserved_tokens = [
f"<|reserved_special_token_{2 + i}|>"
for i in range(self.num_reserved_special_tokens - len(special_tokens))
]
special_tokens = special_tokens + reserved_tokens

self.special_tokens = {
token: num_base_tokens + i for i, token in enumerate(special_tokens)
}
@@ -85,28 +101,28 @@ def __init__(self, model_path: str):
mergeable_ranks=mergeable_ranks,
special_tokens=self.special_tokens,
)
logger.info(f"Reloaded SentencePiece model from {model_path}")

self.n_words: int = num_base_tokens + len(special_tokens)
# BOS / EOS token IDs
self.n_words: int = self.model.n_vocab
self.bos_id: int = self.special_tokens["<|begin_of_text|>"]
self.eos_id: int = self.special_tokens["<|end_of_text|>"]
self.pad_id: int = -1
self.stop_tokens = {
self.special_tokens["<|end_of_text|>"],
self.eot_id: int = self.special_tokens["<|eot_id|>"]
self.eom_id: int = self.special_tokens["<|eom_id|>"]
self.python_tag_id = self.special_tokens["<|python_tag|>"]
self.pad_id: int = self.special_tokens["<|finetune_right_pad_id|>"]
self.stop_tokens = [
self.eos_id,
self.special_tokens["<|eom_id|>"],
self.special_tokens["<|eot_id|>"],
}
logger.info(
f"#words: {self.n_words} - BOS ID: {self.bos_id} - EOS ID: {self.eos_id}"
)
]
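
To make the resulting ID layout concrete: special tokens are appended after the base BPE ranks, so each one gets the ID num_base_tokens plus its position in the list. A self-contained sketch follows; the 128,000 base-token count is the Llama 3 vocabulary size and the 256 reserved-token count is an assumption here, neither value is shown in the diff:

# Sketch of the special-token ID layout, assuming 128,000 base BPE ranks
# (Llama 3) and 256 reserved special tokens.
num_base_tokens = 128_000
num_reserved_special_tokens = 256

special_tokens = [
    "<|begin_of_text|>",
    "<|end_of_text|>",
    "<|reserved_special_token_0|>",
    "<|reserved_special_token_1|>",
    "<|finetune_right_pad_id|>",
    "<|step_id|>",
    "<|start_header_id|>",
    "<|end_header_id|>",
    "<|eom_id|>",
    "<|eot_id|>",
    "<|python_tag|>",
    "<|image|>",
]
special_tokens += [
    f"<|reserved_special_token_{2 + i}|>"
    for i in range(num_reserved_special_tokens - len(special_tokens))
]

ids = {tok: num_base_tokens + i for i, tok in enumerate(special_tokens)}
assert ids["<|begin_of_text|>"] == 128_000
assert ids["<|eot_id|>"] == 128_009
assert len(special_tokens) == num_reserved_special_tokens  # 256 in total
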

def encode(
self,
s: str,
*,
bos: bool,
eos: bool,
allowed_special: Union[Literal["all"], AbstractSet[str]] = set(), # noqa B006
allowed_special: Optional[Union[Literal["all"], AbstractSet[str]]] = None,
disallowed_special: Union[Literal["all"], Collection[str]] = (),
) -> List[int]:
"""
@@ -125,22 +141,15 @@ def encode(
By default, setting disallowed_special=() encodes a string by ignoring
special tokens. Specifically:
- Setting `disallowed_special` to () will cause all text corresponding
to special tokens to be encoded as natural text (instead of raising
to special tokens to be encoded as natural text (insteading of raising
an error).
- Setting `allowed_special` to "all" will treat all text corresponding
to special tokens to be encoded as special tokens.
"""
if allowed_special is None:
allowed_special = set()
assert type(s) is str

# The tiktoken tokenizer can handle <=400k chars without
# pyo3_runtime.PanicException (may go beyond 400k)
TIKTOKEN_MAX_ENCODE_CHARS = 400_000

# https://github.com/openai/tiktoken/issues/195
# Here we iterate over subsequences and split if we exceed the limit
# of max consecutive non-whitespace or whitespace characters.
MAX_NO_WHITESPACES_CHARS = 25_000

substrs = (
substr
for i in range(0, len(s), TIKTOKEN_MAX_ENCODE_CHARS)
@@ -173,16 +182,16 @@ def decode(self, t: Sequence[int]) -> str:
Returns:
str: The decoded string.
"""
# typecast is safe here, Tiktoken doesn't do anything list-related with the sequence.
# Typecast is safe here. Tiktoken doesn't do anything list-related with the sequence.
return self.model.decode(cast(List[int], t))

@staticmethod
def _split_whitespaces_or_nonwhitespaces(
s: str, max_consecutive_slice_len: int
) -> Iterator[str]:
"""
Split the string `s` so that each substring contains no more than `max_consecutive_slice_len`
consecutive whitespaces or consecutive non-whitespaces
Splits the string `s` so that each substring contains no more than `max_consecutive_slice_len`
consecutive whitespaces or consecutive non-whitespaces.
"""
current_slice_len = 0
current_slice_is_space = s[0].isspace() if len(s) > 0 else False
@@ -201,33 +210,3 @@ def _split_whitespaces_or_nonwhitespaces(
slice_start = i
current_slice_len = 1
yield s[slice_start:]
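
The helper above backs the chunking in encode: inputs are first windowed into pieces of at most TIKTOKEN_MAX_ENCODE_CHARS characters, then each window is split whenever a run of same-class characters (all whitespace or all non-whitespace) exceeds the limit. A standalone sketch with a toy limit of 4 in place of MAX_NO_WHITESPACES_CHARS:

from typing import Iterator

def split_ws_or_non_ws(s: str, max_consecutive_slice_len: int) -> Iterator[str]:
    # Same algorithm as _split_whitespaces_or_nonwhitespaces above, repeated
    # here so the example runs standalone.
    current_slice_len = 0
    current_slice_is_space = s[0].isspace() if len(s) > 0 else False
    slice_start = 0
    for i in range(len(s)):
        is_now_space = s[i].isspace()
        if current_slice_is_space ^ is_now_space:
            # Character class flipped: reset the run counter, keep the slice open.
            current_slice_len = 1
            current_slice_is_space = is_now_space
        else:
            current_slice_len += 1
            if current_slice_len > max_consecutive_slice_len:
                # Run got too long: emit everything up to here and start over.
                yield s[slice_start:i]
                slice_start = i
                current_slice_len = 1
    yield s[slice_start:]

print(list(split_ws_or_non_ws("aaaaaaaa bb", 4)))
# -> ['aaaa', 'aaaa bb']: the 8-character run of 'a' is cut so that no single
#    run inside a slice exceeds 4 consecutive same-class characters.
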


class ChatFormat:
def __init__(self, tokenizer: Tokenizer):
self.tokenizer = tokenizer

def encode_header(self, message: Message) -> List[int]:
tokens = []
tokens.append(self.tokenizer.special_tokens["<|start_header_id|>"])
tokens.extend(self.tokenizer.encode(message["role"], bos=False, eos=False))
tokens.append(self.tokenizer.special_tokens["<|end_header_id|>"])
tokens.extend(self.tokenizer.encode("\n\n", bos=False, eos=False))
return tokens

def encode_message(self, message: Message) -> List[int]:
tokens = self.encode_header(message)
tokens.extend(
self.tokenizer.encode(message["content"].strip(), bos=False, eos=False)
)
tokens.append(self.tokenizer.special_tokens["<|eot_id|>"])
return tokens

def encode_dialog_prompt(self, dialog: Dialog) -> List[int]:
tokens = []
tokens.append(self.tokenizer.special_tokens["<|begin_of_text|>"])
for message in dialog:
tokens.extend(self.encode_message(message))
# Add the start of an assistant message for the model to complete
tokens.extend(self.encode_header({"role": "assistant", "content": ""}))
return tokens