Skip to content

Commit

Permalink
--allow-special option, closes #13
Browse files Browse the repository at this point in the history
  • Loading branch information
simonw committed May 2, 2024
1 parent b1b0aae commit 7081f94
Show file tree
Hide file tree
Showing 3 changed files with 43 additions and 2 deletions.
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -217,6 +217,7 @@ Options:
--encode, --tokens Output token integers
--decode Convert token integers to text
--tokens Output full tokens
--allow-special Do not error on special tokens
--help Show this message and exit.
```
Expand Down
13 changes: 13 additions & 0 deletions tests/test_ttok.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,3 +94,16 @@ def test_ttok_file(use_stdin, use_extra_args):
result = runner.invoke(cli, args, **kwargs)
assert result.exit_code == 0
assert result.output.strip() == str(expected_count)


def test_ttok_special_tokens():
    """Check that special tokens are rejected unless --allow-special is given.

    Regression test for https://github.com/simonw/ttok/issues/13
    """
    special_text = "<|endoftext|>"
    cli_runner = CliRunner()

    # Default behaviour: encoding a special token fails, and the error
    # output points the user at the flag that lifts the restriction.
    rejected = cli_runner.invoke(cli, [special_text, "--encode"])
    assert rejected.exit_code != 0
    assert "Use --allow-special to allow special tokens" in rejected.output

    # Opting in: the same input now succeeds and prints a single token id.
    accepted = cli_runner.invoke(
        cli, [special_text, "--encode", "--allow-special"]
    )
    assert accepted.exit_code == 0
    assert accepted.output.strip() == "100257"
31 changes: 29 additions & 2 deletions ttok/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,17 @@
"decode_tokens", "--decode", is_flag=True, help="Convert token integers to text"
)
@click.option("as_tokens", "--tokens", is_flag=True, help="Output full tokens")
def cli(prompt, input, truncate, model, encode_tokens, decode_tokens, as_tokens):
@click.option("--allow-special", is_flag=True, help="Do not error on special tokens")
def cli(
prompt,
input,
truncate,
model,
encode_tokens,
decode_tokens,
as_tokens,
allow_special,
):
"""
Count and truncate text based on tokens
Expand Down Expand Up @@ -57,6 +67,10 @@ def cli(prompt, input, truncate, model, encode_tokens, decode_tokens, as_tokens)
"""
if decode_tokens and encode_tokens:
raise click.ClickException("Cannot use --decode with --encode")
if allow_special and not (encode_tokens or as_tokens):
raise click.ClickException(
"Cannot use --allow-special without --encode or --tokens"
)
if as_tokens and not decode_tokens and not encode_tokens:
encode_tokens = True
try:
Expand All @@ -82,7 +96,20 @@ def cli(prompt, input, truncate, model, encode_tokens, decode_tokens, as_tokens)
return

# Tokenize it
tokens = encoding.encode(text)
kwargs = {}
if allow_special:
kwargs["allowed_special"] = "all"
try:
tokens = encoding.encode(text, **kwargs)
except ValueError as ex:
ex_str = str(ex)
if "disallowed special token" in ex_str and not allow_special:
# Just the first line, then add a hint
ex_str = (
ex_str.split("\n")[0]
+ "\n\nUse --allow-special to allow special tokens"
)
raise click.ClickException(ex_str)
if truncate:
tokens = tokens[:truncate]

Expand Down

0 comments on commit 7081f94

Please sign in to comment.