Skip to content

Commit

Permalink
bpo-24214: Fixed the UTF-8 incremental decoder. (GH-12603) (GH-12627)
Browse files Browse the repository at this point in the history
The bug occurred when an encoded surrogate character was passed
to the incremental decoder in two chunks.
(cherry picked from commit 7a465cb)

Co-authored-by: Serhiy Storchaka <storchaka@gmail.com>
  • Loading branch information
miss-islington and serhiy-storchaka committed Mar 30, 2019
1 parent 4724ba9 commit bd48280
Show file tree
Hide file tree
Showing 3 changed files with 14 additions and 0 deletions.
9 changes: 9 additions & 0 deletions Lib/test/test_codecs.py
Expand Up @@ -401,6 +401,15 @@ def test_lone_surrogates(self):
self.assertEqual(test_sequence.decode(self.encoding, "backslashreplace"),
before + backslashreplace + after)

def test_incremental_surrogatepass(self):
    # Regression test for issue #24214: a lone surrogate encoded with the
    # 'surrogatepass' handler is fed to the incremental decoder in two
    # chunks. The first chunk must decode to nothing and the second
    # (passed as the final chunk) must yield the surrogate.
    encoded = '\uD901'.encode(self.encoding, 'surrogatepass')
    # Try every split point that leaves both chunks non-empty.
    for split in range(1, len(encoded)):
        head = encoded[:split]
        tail = encoded[split:]
        # A fresh decoder per split so no state leaks between iterations.
        make_decoder = codecs.getincrementaldecoder(self.encoding)
        decoder = make_decoder('surrogatepass')
        self.assertEqual(decoder.decode(head), '')
        self.assertEqual(decoder.decode(tail, True), '\uD901')


class UTF32Test(ReadTest, unittest.TestCase):
encoding = "utf-32"
Expand Down
@@ -0,0 +1,2 @@
Fixed support for the surrogatepass error handler in the UTF-8 incremental
decoder.
3 changes: 3 additions & 0 deletions Objects/unicodeobject.c
Expand Up @@ -4890,6 +4890,9 @@ PyUnicode_DecodeUTF8Stateful(const char *s,
case 2:
case 3:
case 4:
if (s == end || consumed) {
goto End;
}
errmsg = "invalid continuation byte";
startinpos = s - starts;
endinpos = startinpos + ch - 1;
Expand Down

0 comments on commit bd48280

Please sign in to comment.