Skip to content

Commit

Permalink
Detect decode_utf8 overflows
Browse files Browse the repository at this point in the history
Add tests from Encode's t/utf8strict.t for handling ill-formed
subsequences, especially overflows and non-continuations.
Add special code for perl 5.6 to handle these, as they are
security relevant. See dankogai/p5-encode#64
and #77
  • Loading branch information
Reini Urban committed Nov 23, 2016
1 parent 22fda35 commit f717689
Show file tree
Hide file tree
Showing 2 changed files with 134 additions and 16 deletions.
67 changes: 53 additions & 14 deletions XS.xs
Expand Up @@ -379,8 +379,8 @@ INLINE UV
decode_utf8 (pTHX_ unsigned char *s, STRLEN len, int relaxed, STRLEN *clen)
{
if (LIKELY(len >= 2
&& IN_RANGE_INC (char, s[0], 0xc2, 0xdf)
&& IN_RANGE_INC (char, s[1], 0x80, 0xbf)))
&& IN_RANGE_INC (char, s[0], 0xc2, 0xdf)
&& IN_RANGE_INC (char, s[1], 0x80, 0xbf)))
{
*clen = 2;
return ((s[0] & 0x1f) << 6) | (s[1] & 0x3f);
Expand All @@ -391,7 +391,7 @@ decode_utf8 (pTHX_ unsigned char *s, STRLEN len, int relaxed, STRLEN *clen)
We accept only valid unicode, unless we are in the relaxed mode. */
#if PERL_VERSION > 12
UV c = utf8n_to_uvuni (s, len, clen,
UTF8_CHECK_ONLY | (relaxed ? 0 : UTF8_DISALLOW_SUPER));
UTF8_CHECK_ONLY | (relaxed ? 0 : UTF8_DISALLOW_SUPER));
#elif PERL_VERSION >= 8
UV c = utf8n_to_uvuni (s, len, clen, UTF8_CHECK_ONLY);
#endif
Expand All @@ -402,31 +402,70 @@ decode_utf8 (pTHX_ unsigned char *s, STRLEN len, int relaxed, STRLEN *clen)
#if PERL_VERSION >= 8
return c;
#else
/* for perl 5.6 */
/* Perl 5.6 does not detect certain ill-formed sequences, especially
overflows, which are security-relevant, so we add code to detect them. */
UV c = utf8_to_uv(s, len, clen, UTF8_CHECK_ONLY);
if (c > PERL_UNICODE_MAX && !relaxed)
*clen = -1;
if (!relaxed) {
if (!c || c > PERL_UNICODE_MAX)
*clen = -1;
/* need to check manually for some overflows. 5.6 unicode bug */
else if (len >= 2
&& IN_RANGE_INC (char, s[0], 0xc0, 0xfe)
&& !IN_RANGE_INC (char, s[0], 0xc2, 0xdf)) {
U8 *s0, *send;
UV uv = *s;
UV expectlen = UTF8SKIP(s);

#define UTF_CONTINUATION_MASK ((U8) ((1U << 6) - 1))
#define UTF_ACCUMULATION_OVERFLOW_MASK \
(((UV) UTF_CONTINUATION_MASK) << ((sizeof(UV) * 8) - 6))

s0 = s;
/*printf ("maybe overlong <%.*s> %d/%d %x %x\n", len, s, c,
*clen, s[0], s[1]);*/
if (*clen > 4) {
*clen = -1;
return c;
}
send = (U8*) s0 + ((expectlen <= len) ? len : len);
for (s = s0 + 1; s < send; s++) {
if (LIKELY(UTF8_IS_CONTINUATION(*s))) {
if (uv & UTF_ACCUMULATION_OVERFLOW_MASK) {
/*printf ("overflow\n");*/
*clen = -1;
return c;
}
uv = UTF8_ACCUMULATE(uv, *s);
}
else {
/*printf ("unexpected non continuation\n");*/
*clen = -1;
return c;
}
}
}
}
return c;
#endif
}
}

/* likewise for encoding, also never called for ascii codepoints */
/* this function takes advantage of this fact, although current gccs */
/* seem to optimise the check for >= 0x80 away anyways */
/* Likewise for encoding, also never called for ascii codepoints. */
/* This function takes advantage of this fact, although current gcc's */
/* seem to optimise the check for >= 0x80 away anyways. */
INLINE unsigned char *
encode_utf8 (unsigned char *s, UV ch)
{
if (UNLIKELY(ch < 0x000080))
if (UNLIKELY(ch < 0x000080))
*s++ = ch;
else if (LIKELY(ch < 0x000800))
*s++ = 0xc0 | ( ch >> 6),
*s++ = 0x80 | ( ch & 0x3f);
else if ( ch < 0x010000)
else if (ch < 0x010000)
*s++ = 0xe0 | ( ch >> 12),
*s++ = 0x80 | ((ch >> 6) & 0x3f),
*s++ = 0x80 | ( ch & 0x3f);
else if ( ch < 0x110000)
else if (ch < 0x110000)
*s++ = 0xf0 | ( ch >> 18),
*s++ = 0x80 | ((ch >> 12) & 0x3f),
*s++ = 0x80 | ((ch >> 6) & 0x3f),
Expand Down Expand Up @@ -787,8 +826,8 @@ encode_str (pTHX_ enc_t *enc, char *str, STRLEN len, int is_utf8)
while (--clen);
}
else
{
need (aTHX_ enc, len += UTF8_MAXBYTES - 1); /* never more than 11 bytes needed */
{ /* never more than 11 bytes needed */
need (aTHX_ enc, len += UTF8_MAXBYTES - 1);
enc->cur = (char*)encode_utf8 ((U8*)enc->cur, uch);
++str;
}
Expand Down
83 changes: 81 additions & 2 deletions t/01_utf8.t
@@ -1,4 +1,4 @@
use Test::More tests => 23;
use Test::More tests => 155;
use utf8;
use Cpanel::JSON::XS;

Expand All @@ -16,7 +16,12 @@ SKIP: {

is(Cpanel::JSON::XS->new->allow_nonref (1)->decode ('"ü"'), "ü");
is(Cpanel::JSON::XS->new->allow_nonref (1)->decode ('"\u00fc"'), "ü");
is(Cpanel::JSON::XS->new->allow_nonref (1)->decode ('"\ud801\udc02' . "\x{10204}\""), "\x{10402}\x{10204}");
if ($] < 5.008) {
eval { decode_json ('"\ud801\udc02' . "\x{10204}\"", 1) };
like $@, qr/malformed UTF-8/;
} else {
is(Cpanel::JSON::XS->new->allow_nonref (1)->decode ('"\ud801\udc02' . "\x{10204}\""), "\x{10402}\x{10204}");
}
is(Cpanel::JSON::XS->new->allow_nonref (1)->decode ('"\"\n\\\\\r\t\f\b"'), "\"\012\\\015\011\014\010");

my $love = $] < 5.008 ? "I \342\235\244 perl" : "I ❤ perl";
Expand Down Expand Up @@ -80,3 +85,77 @@ is(Cpanel::JSON::XS->new->binary->encode ([$love]), '["I \xe2\x9d\xa4 perl"]', '
is ($d, "\x{fdd0}", "no warning with relaxed");
is($w, undef);
}

# security exploits via ill-formed subsequences
# see http://unicode.org/reports/tr36/#UTF-8_Exploit
# testcases from Encode/t/utf8strict.t
# None of these sequences is handled by the fast, unsafe XS decoder;
# they are passed through to the safe Perl decoder, which detects them.
# Ill-formed UTF-8 byte sequences, each written as space-separated hex
# octets. The trailing numbers (e.g. 3.1.1) are the test numbers on
# http://smontagu.damowmow.com/utf8test.html
# Long, regular runs are generated instead of being spelled out by hand.
my @ill = (
    '80',                                                     # 3.1.1
    'bf',                                                     # 3.1.2
    '80 bf',                                                  # 3.1.3
    '80 bf 80',                                               # 3.1.4
    '80 bf 80 bf',                                            # 3.1.5
    '80 bf 80 bf 80',                                         # 3.1.6
    '80 bf 80 bf 80 bf',                                      # 3.1.7
    '80 bf 80 bf 80 bf 80',                                   # 3.1.8
    join(' ', map { sprintf('%02x', $_) } 0x80 .. 0xbf),      # 3.1.9
    join(' ', map { sprintf('%02x 20', $_) } 0xc0 .. 0xdf),   # 3.2.1
    join(' ', map { sprintf('%02x 20', $_) } 0xe0 .. 0xef),   # 3.2.2
    join(' ', map { sprintf('%02x 20', $_) } 0xf0 .. 0xf7),   # 3.2.3
    'f8 20 f9 20 fa 20 fb 20',                                # 3.2.4
    'fc 20 fd 20',                                            # 3.2.5
    'c0',                                                     # 3.3.1
    'e0 80',                                                  # 3.3.2
    'f0 80 80',                                               # 3.3.3
    'f8 80 80 80',                                            # 3.3.4
    'fc 80 80 80 80',                                         # 3.3.5
    'df',                                                     # 3.3.6
    'ef bf',                                                  # 3.3.7
    'f7 bf bf',                                               # 3.3.8
    'fb bf bf bf',                                            # 3.3.9
    'fd bf bf bf bf',                                         # 3.3.10
    'c0 e0 80 f0 80 80 f8 80 80 80 fc 80 80 80 80 df ef bf f7 bf bf fb bf bf bf fd bf bf bf bf', # 3.4.1
    'fe',                                                     # 3.5.1
    'ff',                                                     # 3.5.2
    'fe fe ff ff',                                            # 3.5.3
    'f0 8f bf bf',                                            # 4.2.3
    'f8 87 bf bf bf',                                         # 4.2.4
    'fc 83 bf bf bf bf',                                      # 4.2.5
    'c0 af',                                                  # 4.1.1 # ! overflow not with perl 5.6
    'e0 80 af',                                               # 4.1.2 # ! overflow not with perl 5.6
    'f0 80 80 af',                                            # 4.1.3 # ! overflow not with perl 5.6
    'f8 80 80 80 af',                                         # 4.1.4 # ! overflow not with perl 5.6
    'fc 80 80 80 80 af',                                      # 4.1.5 # ! overflow not with perl 5.6
    'c1 bf',                                                  # 4.2.1 # ! overflow not with perl 5.6
    'e0 9f bf',                                               # 4.2.2 # ! overflow not with perl 5.6
    'c0 80',                                                  # 4.3.1 # xx! overflow not with perl 5.6
    'e0 80 80',                                               # 4.3.2 # xx! overflow not with perl 5.6
    'f0 80 80 80',                                            # 4.3.3 # xx! overflow not with perl 5.6
    'f8 80 80 80 80',                                         # 4.3.4 # xx! overflow not with perl 5.6
    'fc 80 80 80 80 80',                                      # 4.3.5 # xx! overflow not with perl 5.6
    # non-shortest form of 5c i.e. "\\"
    'c1 9c',                                                  # ! not with perl 5.6
);

{
    # Every entry of @ill must make decode_json die with a
    # "malformed UTF-8 character" error and must not emit any warning.
    # The inputs are raw UTF-8 bytes, not multibyte codepoints, so most
    # of them work with perl 5.6 as well.
    #
    # Three assertions are run per entry, matching the fixed test plan.
    local $^W = 1;                  # restored when this block is left
    my $w;                          # last warning seen, undef if none
    warnings->import($] < 5.014 ? 'utf8' : 'nonchar');
    local $SIG{__WARN__} = sub { $w = shift };

    for my $ill (@ill) {
        # Turn the space-separated hex octets into a raw byte string.
        my $o = pack "C*" => map {hex} split /\s+/, $ill;
        my $d = eval { decode_json("[\"$o\"]"); };
        # $@ is a global; copy it before calling anything else.
        my $err = $@;

        # 1. decoding must fail (the test name shows the start of the error)
        is ($d, undef, substr($err,0,25))
            or diag((defined $w ? $w : ''), ' ', $ill, "\t => ", $d->[0], " $err");
        # 2. it must fail with the expected error message
        like($err, qr/malformed UTF-8 character/, "ill-formed utf8 <$ill> throws error");
        # 3. no warning may have been emitted; the original re-tested $d
        #    here, so the captured warning was never actually checked
        is($w, undef, "without warning");
        $w = undef;
    }
}

0 comments on commit f717689

Please sign in to comment.