Skip to content

Commit

Permalink
Optionally ignore UTF-8 decoding errors when converting std::string to Python str. (#97282)
Browse files Browse the repository at this point in the history

Summary:
X-link: pytorch/pytorch#97282

Pull Request resolved: pytorch#2126

When language models use a C++ tokenizer, the outputs are C++ strings that are not necessarily valid UTF-8 encodings. Default pybind11 casting uses strict UTF-8 decoding. We relax the decoding using the 'ignore' argument.

Reviewed By: Nayef211

Differential Revision: D43970697

fbshipit-source-id: 37da8270cfd4ae11a43aeb7ab7093edd7d800cee
  • Loading branch information
shuminghu authored and facebook-github-bot committed Mar 21, 2023
1 parent 145479c commit 7957a3d
Show file tree
Hide file tree
Showing 4 changed files with 27 additions and 0 deletions.
20 changes: 20 additions & 0 deletions test/torchtext_unittest/test_transforms.py
Original file line number Diff line number Diff line change
Expand Up @@ -694,6 +694,16 @@ def _gpt2_bpe_decoder_with_special_tokens(self, tokenizer):
for idx, ids in enumerate(sample_ids):
self.assertEqual(tokenizer.decode(ids), expected_texts[idx])

def _gpt_bpe_decoder_partial_utf8(self, tokenizer):
sample_ids = [
['47728', '245', '114'],
['47728', '245', '114', '47728'], # containing partial utf-8 encoding
]
expected_texts = ["𝗶", "𝗶"]

for idx, ids in enumerate(sample_ids):
self.assertEqual(tokenizer.decode(ids), expected_texts[idx])

@nested_params([True, False], [True, False])
def test_gpt2_bpe_tokenizer(self, test_scripting, return_tokens):
"""test tokenization on single sentence input as well as batch on sentences"""
Expand All @@ -704,6 +714,16 @@ def test_gpt2_bpe_decoder(self):
self._gpt2_bpe_decoder(self._load_tokenizer(test_scripting=False, return_tokens=False))
self._gpt2_bpe_decoder_with_special_tokens(self._load_tokenizer(test_scripting=False, return_tokens=False))

torch.ops.torchtext.set_utf8_decoding_ignore(True)
self._gpt_bpe_decoder_partial_utf8(self._load_tokenizer(test_scripting=False, return_tokens=False))
self._gpt_bpe_decoder_partial_utf8(self._load_tokenizer(test_scripting=True, return_tokens=False))

torch.ops.torchtext.set_utf8_decoding_ignore(False)
with self.assertRaises(UnicodeDecodeError):
self._gpt_bpe_decoder_partial_utf8(self._load_tokenizer(test_scripting=True, return_tokens=False))



@nested_params([True, False])
def test_gpt2_bpe_tokenizer_with_added_vocab(self, return_tokens):
self._gpt2_bpe_tokenizer_with_added_vocab(
Expand Down
3 changes: 3 additions & 0 deletions torchtext/csrc/register_pybindings.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -287,6 +287,9 @@ PYBIND11_MODULE(_torchtext, m) {
m.def(
"_build_vocab_from_text_file_using_python_tokenizer",
&_build_vocab_from_text_file_using_python_tokenizer);
m.def(
"torchtext::set_utf8_decoding_ignore",
&torch::jit::setUTF8DecodingIgnore);
}

} // namespace torchtext
3 changes: 3 additions & 0 deletions torchtext/csrc/register_torchbindings.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -210,6 +210,9 @@ TORCH_LIBRARY_FRAGMENT(torchtext, m) {
m.def("torchtext::load_sp_model", &load_sp_model);
m.def("torchtext::load_sp_model_string", &load_sp_model_string);
m.def("torchtext::gpt2_bpe_pre_tokenizer", &gpt2_bpe_pre_tokenizer);
m.def(
"torchtext::set_utf8_decoding_ignore",
&torch::jit::setUTF8DecodingIgnore);
}

} // namespace torchtext
1 change: 1 addition & 0 deletions torchtext/transforms.py
Original file line number Diff line number Diff line change
Expand Up @@ -415,6 +415,7 @@ def __prepare_scriptable__(self):
return tokenizer_copy
return self

@torch.jit.export
def decode(self, tokens: List[str]) -> str:
"""Return a decoded string given a list of string token ids.
Expand Down

0 comments on commit 7957a3d

Please sign in to comment.