From f71768984ba7f50b0476c17a4f3b3f2ca88a6951 Mon Sep 17 00:00:00 2001 From: Reini Urban Date: Wed, 23 Nov 2016 19:24:37 +0100 Subject: [PATCH] Detect decode_utf8 overflows add tests from Encode t/utf8strict.t for handling ill-formed subsequences, esp. overflows and non-continuations. Add special code for perl 5.6 to handle these, as they are security relevant. See https://github.com/dankogai/p5-encode/issues/64 and https://github.com/rurban/Cpanel-JSON-XS/issues/77 --- XS.xs | 67 +++++++++++++++++++++++++++++++++--------- t/01_utf8.t | 83 +++++++++++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 134 insertions(+), 16 deletions(-) diff --git a/XS.xs b/XS.xs index 5c82340..43cf73a 100644 --- a/XS.xs +++ b/XS.xs @@ -379,8 +379,8 @@ INLINE UV decode_utf8 (pTHX_ unsigned char *s, STRLEN len, int relaxed, STRLEN *clen) { if (LIKELY(len >= 2 - && IN_RANGE_INC (char, s[0], 0xc2, 0xdf) - && IN_RANGE_INC (char, s[1], 0x80, 0xbf))) + && IN_RANGE_INC (char, s[0], 0xc2, 0xdf) + && IN_RANGE_INC (char, s[1], 0x80, 0xbf))) { *clen = 2; return ((s[0] & 0x1f) << 6) | (s[1] & 0x3f); @@ -391,7 +391,7 @@ decode_utf8 (pTHX_ unsigned char *s, STRLEN len, int relaxed, STRLEN *clen) We accept only valid unicode, unless we are in the relaxed mode. */ #if PERL_VERSION > 12 UV c = utf8n_to_uvuni (s, len, clen, - UTF8_CHECK_ONLY | (relaxed ? 0 : UTF8_DISALLOW_SUPER)); + UTF8_CHECK_ONLY | (relaxed ? 0 : UTF8_DISALLOW_SUPER)); #elif PERL_VERSION >= 8 UV c = utf8n_to_uvuni (s, len, clen, UTF8_CHECK_ONLY); #endif @@ -402,31 +402,70 @@ decode_utf8 (pTHX_ unsigned char *s, STRLEN len, int relaxed, STRLEN *clen) #if PERL_VERSION >= 8 return c; #else - /* for perl 5.6 */ + /* 5.6 does not detect certain ill-formed sequences, esp. overflows, + which are security relevant. so we add code to detect these. */ UV c = utf8_to_uv(s, len, clen, UTF8_CHECK_ONLY); - if (c > PERL_UNICODE_MAX && !relaxed) - *clen = -1; + if (!relaxed) { + if (!c || c > PERL_UNICODE_MAX) + *clen = -1; + /* need to check manually for some overflows. 5.6 unicode bug */ + else if (len >= 2 + && IN_RANGE_INC (char, s[0], 0xc0, 0xfe) + && !IN_RANGE_INC (char, s[0], 0xc2, 0xdf)) { + U8 *s0, *send; + UV uv = *s; + UV expectlen = UTF8SKIP(s); + +#define UTF_CONTINUATION_MASK ((U8) ((1U << 6) - 1)) +#define UTF_ACCUMULATION_OVERFLOW_MASK \ + (((UV) UTF_CONTINUATION_MASK) << ((sizeof(UV) * 8) - 6)) + + s0 = s; + /*printf ("maybe overlong <%.*s> %d/%d %x %x\n", len, s, c, + *clen, s[0], s[1]);*/ + if (*clen > 4) { + *clen = -1; + return c; + } + send = (U8*) s0 + ((expectlen <= len) ? len : len); + for (s = s0 + 1; s < send; s++) { + if (LIKELY(UTF8_IS_CONTINUATION(*s))) { + if (uv & UTF_ACCUMULATION_OVERFLOW_MASK) { + /*printf ("overflow\n");*/ + *clen = -1; + return c; + } + uv = UTF8_ACCUMULATE(uv, *s); + } + else { + /*printf ("unexpected non continuation\n");*/ + *clen = -1; + return c; + } + } + } + } return c; #endif } } -/* likewise for encoding, also never called for ascii codepoints */ -/* this function takes advantage of this fact, although current gccs */ -/* seem to optimise the check for >= 0x80 away anyways */ +/* Likewise for encoding, also never called for ascii codepoints. */ +/* This function takes advantage of this fact, although current gcc's */ +/* seem to optimise the check for >= 0x80 away anyways. */ INLINE unsigned char * encode_utf8 (unsigned char *s, UV ch) { - if (UNLIKELY(ch < 0x000080)) + if (UNLIKELY(ch < 0x000080)) *s++ = ch; else if (LIKELY(ch < 0x000800)) *s++ = 0xc0 | ( ch >> 6), *s++ = 0x80 | ( ch & 0x3f); - else if ( ch < 0x010000) + else if (ch < 0x010000) *s++ = 0xe0 | ( ch >> 12), *s++ = 0x80 | ((ch >> 6) & 0x3f), *s++ = 0x80 | ( ch & 0x3f); - else if ( ch < 0x110000) + else if (ch < 0x110000) *s++ = 0xf0 | ( ch >> 18), *s++ = 0x80 | ((ch >> 12) & 0x3f), *s++ = 0x80 | ((ch >> 6) & 0x3f), @@ -787,8 +826,8 @@ encode_str (pTHX_ enc_t *enc, char *str, STRLEN len, int is_utf8) while (--clen); } else - { - need (aTHX_ enc, len += UTF8_MAXBYTES - 1); /* never more than 11 bytes needed */ + { /* never more than 11 bytes needed */ + need (aTHX_ enc, len += UTF8_MAXBYTES - 1); enc->cur = (char*)encode_utf8 ((U8*)enc->cur, uch); ++str; } diff --git a/t/01_utf8.t b/t/01_utf8.t index 72eef49..c30f7e4 100644 --- a/t/01_utf8.t +++ b/t/01_utf8.t @@ -1,4 +1,4 @@ -use Test::More tests => 23; +use Test::More tests => 155; use utf8; use Cpanel::JSON::XS; @@ -16,7 +16,12 @@ SKIP: { is(Cpanel::JSON::XS->new->allow_nonref (1)->decode ('"ü"'), "ü"); is(Cpanel::JSON::XS->new->allow_nonref (1)->decode ('"\u00fc"'), "ü"); -is(Cpanel::JSON::XS->new->allow_nonref (1)->decode ('"\ud801\udc02' . "\x{10204}\""), "\x{10402}\x{10204}"); +if ($] < 5.008) { + eval { decode_json ('"\ud801\udc02' . "\x{10204}\"", 1) }; + like $@, qr/malformed UTF-8/; +} else { + is(Cpanel::JSON::XS->new->allow_nonref (1)->decode ('"\ud801\udc02' . "\x{10204}\""), "\x{10402}\x{10204}"); +} is(Cpanel::JSON::XS->new->allow_nonref (1)->decode ('"\"\n\\\\\r\t\f\b"'), "\"\012\\\015\011\014\010"); my $love = $] < 5.008 ? "I \342\235\244 perl" : "I ❤ perl"; @@ -80,3 +85,77 @@ is(Cpanel::JSON::XS->new->binary->encode ([$love]), '["I \xe2\x9d\xa4 perl"]', ' is ($d, "\x{fdd0}", "no warning with relaxed"); is($w, undef); } + +# security exploits via ill-formed subsequences +# see http://unicode.org/reports/tr36/#UTF-8_Exploit +# testcases from Encode/t/utf8strict.t +# All these sequences are not handled by the unsafe, fast XS decoder, +# rather passed through to the safe Perl decoder, which detects those. +my @ill = + (# http://smontagu.damowmow.com/utf8test.html + # The numbers below, like 2.1.2 are test numbers on this web page + qq/80/ , # 3.1.1 + qq/bf/ , # 3.1.2 + qq/80 bf/ , # 3.1.3 + qq/80 bf 80/ , # 3.1.4 + qq/80 bf 80 bf/ , # 3.1.5 + qq/80 bf 80 bf 80/ , # 3.1.6 + qq/80 bf 80 bf 80 bf/ , # 3.1.7 + qq/80 bf 80 bf 80 bf 80/ , # 3.1.8 + qq/80 81 82 83 84 85 86 87 88 89 8a 8b 8c 8d 8e 8f 90 91 92 93 94 95 96 97 98 99 9a 9b 9c 9d 9e 9f a0 a1 a2 a3 a4 a5 a6 a7 a8 a9 aa ab ac ad ae af b0 b1 b2 b3 b4 b5 b6 b7 b8 b9 ba bb bc bd be bf/ , # 3.1.9 + qq/c0 20 c1 20 c2 20 c3 20 c4 20 c5 20 c6 20 c7 20 c8 20 c9 20 ca 20 cb 20 cc 20 cd 20 ce 20 cf 20 d0 20 d1 20 d2 20 d3 20 d4 20 d5 20 d6 20 d7 20 d8 20 d9 20 da 20 db 20 dc 20 dd 20 de 20 df 20/ , # 3.2.1 + qq/e0 20 e1 20 e2 20 e3 20 e4 20 e5 20 e6 20 e7 20 e8 20 e9 20 ea 20 eb 20 ec 20 ed 20 ee 20 ef 20/ , # 3.2.2 + qq/f0 20 f1 20 f2 20 f3 20 f4 20 f5 20 f6 20 f7 20/ , # 3.2.3 + qq/f8 20 f9 20 fa 20 fb 20/ , # 3.2.4 + qq/fc 20 fd 20/ , # 3.2.5 + qq/c0/ , # 3.3.1 + qq/e0 80/ , # 3.3.2 + qq/f0 80 80/ , # 3.3.3 + qq/f8 80 80 80/ , # 3.3.4 + qq/fc 80 80 80 80/ , # 3.3.5 + qq/df/ , # 3.3.6 + qq/ef bf/ , # 3.3.7 + qq/f7 bf bf/ , # 3.3.8 + qq/fb bf bf bf/ , # 3.3.9 + qq/fd bf bf bf bf/ , # 3.3.10 + qq/c0 e0 80 f0 80 80 f8 80 80 80 fc 80 80 80 80 df ef bf f7 bf bf fb bf bf bf fd bf bf bf bf/ , # 3.4.1 + qq/fe/ , # 3.5.1 + qq/ff/ , # 3.5.2 + qq/fe fe ff ff/ , # 3.5.3 + qq/f0 8f bf bf/ , # 4.2.3 + qq/f8 87 bf bf bf/ , # 4.2.4 + qq/fc 83 bf bf bf bf/ , # 4.2.5 + qq/c0 af/ , # 4.1.1 # ! overflow not with perl 5.6 + qq/e0 80 af/ , # 4.1.2 # ! overflow not with perl 5.6 + qq/f0 80 80 af/ , # 4.1.3 # ! overflow not with perl 5.6 + qq/f8 80 80 80 af/ , # 4.1.4 # ! overflow not with perl 5.6 + qq/fc 80 80 80 80 af/ , # 4.1.5 # ! overflow not with perl 5.6 + qq/c1 bf/ , # 4.2.1 # ! overflow not with perl 5.6 + qq/e0 9f bf/ , # 4.2.2 # ! overflow not with perl 5.6 + qq/c0 80/ , # 4.3.1 # xx! overflow not with perl 5.6 + qq/e0 80 80/ , # 4.3.2 # xx! overflow not with perl 5.6 + qq/f0 80 80 80/ , # 4.3.3 # xx! overflow not with perl 5.6 + qq/f8 80 80 80 80/ , # 4.3.4 # xx! overflow not with perl 5.6 + qq/fc 80 80 80 80 80/ , # 4.3.5 # xx! overflow not with perl 5.6 + # non-shortest form of 5c i.e. "\\" + qq/c1 9c/ , # ! not with perl 5.6 + ); + +{ + # these are no multibyte codepoints, just raw utf8 bytes, + # so most of them work with 5.6 also. + $^W = 1; + my $w; + warnings->import($] < 5.014 ? 'utf8' : 'nonchar'); + $SIG{__WARN__} = sub { $w = shift }; + + for my $ill (@ill) { + my $o = pack "C*" => map {hex} split /\s+/, $ill; + my $d = eval { decode_json("[\"$o\"]"); }; + is ($d, undef, substr($@,0,25)) + or diag $w, ' ', $ill, "\t => ", $d->[0], " $@"; + like($@, qr/malformed UTF-8 character/, "ill-formed utf8 <$ill> throws error"); + is($d, undef, "without warning"); + $w = undef; + } +}