Skip to content

Commit

Permalink
Optionally ignore UTF-8 decoding errors when converting std::string to Python str. (#97282)
Browse files Browse the repository at this point in the history

Summary:
X-link: pytorch/pytorch#97282

Pull Request resolved: pytorch#2126

When language models use a C++ tokenizer, the outputs are C++ strings that are not necessarily valid UTF-8 encodings. Default pybind11 casting uses strict UTF-8 decoding. We relax the decoding using the 'ignore' argument.

Reviewed By: Nayef211

Differential Revision: D43970697

fbshipit-source-id: 37da8270cfd4ae11a43aeb7ab7093edd7d800cee
  • Loading branch information
shuminghu authored and facebook-github-bot committed Mar 21, 2023
1 parent 145479c commit 7957a3d
Show file tree
Hide file tree
Showing 4 changed files with 27 additions and 0 deletions.
20 changes: 20 additions & 0 deletions test/torchtext_unittest/test_transforms.py
Original file line number Diff line number Diff line change
Expand Up @@ -694,6 +694,16 @@ def _gpt2_bpe_decoder_with_special_tokens(self, tokenizer):
for idx, ids in enumerate(sample_ids):
self.assertEqual(tokenizer.decode(ids), expected_texts[idx])

def _gpt_bpe_decoder_partial_utf8(self, tokenizer):
sample_ids = [
['47728', '245', '114'],
['47728', '245', '114', '47728'], # containing partial utf-8 encoding
]
expected_texts = ["𝗶", "𝗶"]

for idx, ids in enumerate(sample_ids):
self.assertEqual(tokenizer.decode(ids), expected_texts[idx])

@nested_params([True, False], [True, False])
def test_gpt2_bpe_tokenizer(self, test_scripting, return_tokens):
"""test tokenization on single sentence input as well as batch on sentences"""
Expand All @@ -704,6 +714,16 @@ def test_gpt2_bpe_decoder(self):
self._gpt2_bpe_decoder(self._load_tokenizer(test_scripting=False, return_tokens=False))
self._gpt2_bpe_decoder_with_special_tokens(self._load_tokenizer(test_scripting=False, return_tokens=False))

torch.ops.torchtext.set_utf8_decoding_ignore(True)
self._gpt_bpe_decoder_partial_utf8(self._load_tokenizer(test_scripting=False, return_tokens=False))
self._gpt_bpe_decoder_partial_utf8(self._load_tokenizer(test_scripting=True, return_tokens=False))

torch.ops.torchtext.set_utf8_decoding_ignore(False)
with self.assertRaises(UnicodeDecodeError):
self._gpt_bpe_decoder_partial_utf8(self._load_tokenizer(test_scripting=True, return_tokens=False))



@nested_params([True, False])
def test_gpt2_bpe_tokenizer_with_added_vocab(self, return_tokens):
self._gpt2_bpe_tokenizer_with_added_vocab(
Expand Down
3 changes: 3 additions & 0 deletions torchtext/csrc/register_pybindings.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -287,6 +287,9 @@ PYBIND11_MODULE(_torchtext, m) {
m.def(
"_build_vocab_from_text_file_using_python_tokenizer",
&_build_vocab_from_text_file_using_python_tokenizer);
m.def(
"torchtext::set_utf8_decoding_ignore",
&torch::jit::setUTF8DecodingIgnore);
}

} // namespace torchtext
3 changes: 3 additions & 0 deletions torchtext/csrc/register_torchbindings.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -210,6 +210,9 @@ TORCH_LIBRARY_FRAGMENT(torchtext, m) {
m.def("torchtext::load_sp_model", &load_sp_model);
m.def("torchtext::load_sp_model_string", &load_sp_model_string);
m.def("torchtext::gpt2_bpe_pre_tokenizer", &gpt2_bpe_pre_tokenizer);
m.def(
"torchtext::set_utf8_decoding_ignore",
&torch::jit::setUTF8DecodingIgnore);
}

} // namespace torchtext
1 change: 1 addition & 0 deletions torchtext/transforms.py
Original file line number Diff line number Diff line change
Expand Up @@ -415,6 +415,7 @@ def __prepare_scriptable__(self):
return tokenizer_copy
return self

@torch.jit.export
def decode(self, tokens: List[str]) -> str:
"""Return a decoded string given a list of string token ids.
Expand Down

0 comments on commit 7957a3d

Please sign in to comment.