diff --git a/README.md b/README.md index 7644dd3..2b34f30 100644 --- a/README.md +++ b/README.md @@ -217,6 +217,7 @@ Options: --encode, --tokens Output token integers --decode Convert token integers to text --tokens Output full tokens + --allow-special Do not error on special tokens --help Show this message and exit. ``` diff --git a/tests/test_ttok.py b/tests/test_ttok.py index d980ea0..1db5fd6 100644 --- a/tests/test_ttok.py +++ b/tests/test_ttok.py @@ -94,3 +94,16 @@ def test_ttok_file(use_stdin, use_extra_args): result = runner.invoke(cli, args, **kwargs) assert result.exit_code == 0 assert result.output.strip() == str(expected_count) + + +def test_ttok_special_tokens(): + # https://github.com/simonw/ttok/issues/13 + runner = CliRunner() + # Without --allow-special raises an error + result = runner.invoke(cli, ["<|endoftext|>", "--encode"]) + assert result.exit_code != 0 + assert "Use --allow-special to allow special tokens" in result.output + # With --allow-special it works + result = runner.invoke(cli, ["<|endoftext|>", "--encode", "--allow-special"]) + assert result.exit_code == 0 + assert result.output.strip() == "100257" diff --git a/ttok/cli.py b/ttok/cli.py index 693843e..dfd8828 100644 --- a/ttok/cli.py +++ b/ttok/cli.py @@ -19,7 +19,17 @@ "decode_tokens", "--decode", is_flag=True, help="Convert token integers to text" ) @click.option("as_tokens", "--tokens", is_flag=True, help="Output full tokens") -def cli(prompt, input, truncate, model, encode_tokens, decode_tokens, as_tokens): +@click.option("--allow-special", is_flag=True, help="Do not error on special tokens") +def cli( + prompt, + input, + truncate, + model, + encode_tokens, + decode_tokens, + as_tokens, + allow_special, +): """ Count and truncate text based on tokens @@ -57,6 +67,10 @@ def cli(prompt, input, truncate, model, encode_tokens, decode_tokens, as_tokens) """ if decode_tokens and encode_tokens: raise click.ClickException("Cannot use --decode with --encode") + if allow_special and not (encode_tokens or as_tokens): + raise click.ClickException( + "Cannot use --allow-special without --encode or --tokens" + ) if as_tokens and not decode_tokens and not encode_tokens: encode_tokens = True try: @@ -82,7 +96,20 @@ def cli(prompt, input, truncate, model, encode_tokens, decode_tokens, as_tokens) return # Tokenize it - tokens = encoding.encode(text) + kwargs = {} + if allow_special: + kwargs["allowed_special"] = "all" + try: + tokens = encoding.encode(text, **kwargs) + except ValueError as ex: + ex_str = str(ex) + if "disallowed special token" in ex_str and not allow_special: + # Just the first line, then add a hint + ex_str = ( + ex_str.split("\n")[0] + + "\n\nUse --allow-special to allow special tokens" + ) + raise click.ClickException(ex_str) if truncate: tokens = tokens[:truncate]