From f71768984ba7f50b0476c17a4f3b3f2ca88a6951 Mon Sep 17 00:00:00 2001
From: Reini Urban <rurban@cpanel.net>
Date: Wed, 23 Nov 2016 19:24:37 +0100
Subject: [PATCH] Detect decode_utf8 overflows

add tests from Encode t/utf8strict.t for handling ill-formed
subsequences, esp. overflows and non-continuations.
Add special code for perl 5.6 to handle these, as they are
security relevant. See https://github.com/dankogai/p5-encode/issues/64
and https://github.com/rurban/Cpanel-JSON-XS/issues/77
---
 XS.xs       | 67 +++++++++++++++++++++++++++++++++---------
 t/01_utf8.t | 83 +++++++++++++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 134 insertions(+), 16 deletions(-)

diff --git a/XS.xs b/XS.xs
index 5c82340..43cf73a 100644
--- a/XS.xs
+++ b/XS.xs
@@ -379,8 +379,8 @@ INLINE UV
 decode_utf8 (pTHX_ unsigned char *s, STRLEN len, int relaxed, STRLEN *clen)
 {
   if (LIKELY(len >= 2
-                   && IN_RANGE_INC (char, s[0], 0xc2, 0xdf)
-                   && IN_RANGE_INC (char, s[1], 0x80, 0xbf)))
+             && IN_RANGE_INC (char, s[0], 0xc2, 0xdf)
+             && IN_RANGE_INC (char, s[1], 0x80, 0xbf)))
     {
       *clen = 2;
       return ((s[0] & 0x1f) << 6) | (s[1] & 0x3f);
@@ -391,7 +391,7 @@ decode_utf8 (pTHX_ unsigned char *s, STRLEN len, int relaxed, STRLEN *clen)
    We accept only valid unicode, unless we are in the relaxed mode. */
 #if PERL_VERSION > 12
     UV c = utf8n_to_uvuni (s, len, clen,
-               UTF8_CHECK_ONLY | (relaxed ? 0 : UTF8_DISALLOW_SUPER));
+                           UTF8_CHECK_ONLY | (relaxed ? 0 : UTF8_DISALLOW_SUPER));
 #elif PERL_VERSION >= 8
     UV c = utf8n_to_uvuni (s, len, clen, UTF8_CHECK_ONLY);
 #endif
@@ -402,31 +402,70 @@ decode_utf8 (pTHX_ unsigned char *s, STRLEN len, int relaxed, STRLEN *clen)
 #if PERL_VERSION >= 8
     return c;
 #else
-    /* for perl 5.6 */
+    /* 5.6 does not detect certain ill-formed sequences, esp. overflows,
+       which are security relevant. so we add code to detect these. */
     UV c = utf8_to_uv(s, len, clen, UTF8_CHECK_ONLY);
-    if (c > PERL_UNICODE_MAX && !relaxed)
-      *clen = -1;
+    if (!relaxed) {
+      if (!c || c > PERL_UNICODE_MAX)
+        *clen = -1;
+      /* need to check manually for some overflows. 5.6 unicode bug */
+      else if (len >= 2
+               && IN_RANGE_INC (char, s[0], 0xc0, 0xfe)
+               && !IN_RANGE_INC (char, s[0], 0xc2, 0xdf)) {
+        U8 *s0, *send;
+        UV uv = *s;
+        UV expectlen = UTF8SKIP(s);
+
+#define UTF_CONTINUATION_MASK           ((U8) ((1U << 6) - 1))
+#define UTF_ACCUMULATION_OVERFLOW_MASK                          \
+        (((UV) UTF_CONTINUATION_MASK) << ((sizeof(UV) * 8) - 6))
+
+        s0 = s;
+        /*printf ("maybe overlong <%.*s> %d/%d %x %x\n", len, s, c,
+                  *clen, s[0], s[1]);*/
+        if (*clen > 4) {
+          *clen = -1;
+          return c;
+        }
+        send = (U8*) s0 + ((expectlen <= len) ? len : len);
+        for (s = s0 + 1; s < send; s++) {
+          if (LIKELY(UTF8_IS_CONTINUATION(*s))) {
+            if (uv & UTF_ACCUMULATION_OVERFLOW_MASK) {
+              /*printf ("overflow\n");*/
+              *clen = -1;
+              return c;
+	    }
+	    uv = UTF8_ACCUMULATE(uv, *s);
+          }
+	  else {
+            /*printf ("unexpected non continuation\n");*/
+            *clen = -1;
+            return c;
+          }
+	}
+      }
+    }
     return c;
 #endif
   }
 }
 
-/* likewise for encoding, also never called for ascii codepoints */
-/* this function takes advantage of this fact, although current gccs */
-/* seem to optimise the check for >= 0x80 away anyways */
+/* Likewise for encoding, also never called for ascii codepoints. */
+/* This function takes advantage of this fact, although current gcc's */
+/* seem to optimise the check for >= 0x80 away anyways. */
 INLINE unsigned char *
 encode_utf8 (unsigned char *s, UV ch)
 {
-  if      (UNLIKELY(ch < 0x000080))
+  if    (UNLIKELY(ch < 0x000080))
     *s++ = ch;
   else if (LIKELY(ch < 0x000800))
     *s++ = 0xc0 | ( ch >>  6),
     *s++ = 0x80 | ( ch        & 0x3f);
-  else if (              ch < 0x010000)
+  else if        (ch < 0x010000)
     *s++ = 0xe0 | ( ch >> 12),
     *s++ = 0x80 | ((ch >>  6) & 0x3f),
     *s++ = 0x80 | ( ch        & 0x3f);
-  else if (              ch < 0x110000)
+  else if        (ch < 0x110000)
     *s++ = 0xf0 | ( ch >> 18),
     *s++ = 0x80 | ((ch >> 12) & 0x3f),
     *s++ = 0x80 | ((ch >>  6) & 0x3f),
@@ -787,8 +826,8 @@ encode_str (pTHX_ enc_t *enc, char *str, STRLEN len, int is_utf8)
                       while (--clen);
                     }
                   else
-                    {
-                      need (aTHX_ enc, len += UTF8_MAXBYTES - 1); /* never more than 11 bytes needed */
+                    { /* never more than 11 bytes needed */
+                      need (aTHX_ enc, len += UTF8_MAXBYTES - 1);
                       enc->cur = (char*)encode_utf8 ((U8*)enc->cur, uch);
                       ++str;
                     }
diff --git a/t/01_utf8.t b/t/01_utf8.t
index 72eef49..c30f7e4 100644
--- a/t/01_utf8.t
+++ b/t/01_utf8.t
@@ -1,4 +1,4 @@
-use Test::More tests => 23;
+use Test::More tests => 155;
 use utf8;
 use Cpanel::JSON::XS;
 
@@ -16,7 +16,12 @@ SKIP: {
 
 is(Cpanel::JSON::XS->new->allow_nonref (1)->decode ('"ü"'), "ü");
 is(Cpanel::JSON::XS->new->allow_nonref (1)->decode ('"\u00fc"'), "ü");
-is(Cpanel::JSON::XS->new->allow_nonref (1)->decode ('"\ud801\udc02' . "\x{10204}\""), "\x{10402}\x{10204}");
+if ($] < 5.008) {
+  eval { decode_json ('"\ud801\udc02' . "\x{10204}\"", 1) };
+  like $@, qr/malformed UTF-8/;
+} else {
+  is(Cpanel::JSON::XS->new->allow_nonref (1)->decode ('"\ud801\udc02' . "\x{10204}\""), "\x{10402}\x{10204}");
+}
 is(Cpanel::JSON::XS->new->allow_nonref (1)->decode ('"\"\n\\\\\r\t\f\b"'), "\"\012\\\015\011\014\010");
 
 my $love = $] < 5.008 ? "I \342\235\244 perl" : "I ❤ perl";
@@ -80,3 +85,77 @@ is(Cpanel::JSON::XS->new->binary->encode ([$love]), '["I \xe2\x9d\xa4 perl"]', '
   is ($d, "\x{fdd0}", "no warning with relaxed");
   is($w, undef);
 }
+
+# security exploits via ill-formed subsequences
+# see http://unicode.org/reports/tr36/#UTF-8_Exploit
+# testcases from Encode/t/utf8strict.t
+# All these sequences are not handled by the unsafe, fast XS decoder,
+# rather passed through to the safe Perl decoder, which detects those.
+my @ill =
+  (# http://smontagu.damowmow.com/utf8test.html
+   # The numbers below, like 2.1.2 are test numbers on this web page
+   qq/80/          ,             # 3.1.1
+   qq/bf/          ,             # 3.1.2
+   qq/80 bf/       ,             # 3.1.3
+   qq/80 bf 80/    ,             # 3.1.4
+   qq/80 bf 80 bf/ ,             # 3.1.5
+   qq/80 bf 80 bf 80/ ,          # 3.1.6
+   qq/80 bf 80 bf 80 bf/ ,       # 3.1.7
+   qq/80 bf 80 bf 80 bf 80/ ,    # 3.1.8
+   qq/80 81 82 83 84 85 86 87 88 89 8a 8b 8c 8d 8e 8f 90 91 92 93 94 95 96 97 98 99 9a 9b 9c 9d 9e 9f a0 a1 a2 a3 a4 a5 a6 a7 a8 a9 aa ab ac ad ae af b0 b1 b2 b3 b4 b5 b6 b7 b8 b9 ba bb bc bd be bf/ , # 3.1.9
+   qq/c0 20 c1 20 c2 20 c3 20 c4 20 c5 20 c6 20 c7 20 c8 20 c9 20 ca 20 cb 20 cc 20 cd 20 ce 20 cf 20 d0 20 d1 20 d2 20 d3 20 d4 20 d5 20 d6 20 d7 20 d8 20 d9 20 da 20 db 20 dc 20 dd 20 de 20 df 20/ , # 3.2.1
+   qq/e0 20 e1 20 e2 20 e3 20 e4 20 e5 20 e6 20 e7 20 e8 20 e9 20 ea 20 eb 20 ec 20 ed 20 ee 20 ef 20/ , # 3.2.2
+   qq/f0 20 f1 20 f2 20 f3 20 f4 20 f5 20 f6 20 f7 20/ , # 3.2.3
+   qq/f8 20 f9 20 fa 20 fb 20/ , # 3.2.4
+   qq/fc 20 fd 20/ ,             # 3.2.5
+   qq/c0/ ,                      # 3.3.1
+   qq/e0 80/ ,                   # 3.3.2
+   qq/f0 80 80/ ,                # 3.3.3
+   qq/f8 80 80 80/ ,             # 3.3.4
+   qq/fc 80 80 80 80/ ,          # 3.3.5
+   qq/df/ ,                      # 3.3.6
+   qq/ef bf/ ,                   # 3.3.7
+   qq/f7 bf bf/ ,                # 3.3.8
+   qq/fb bf bf bf/ ,             # 3.3.9
+   qq/fd bf bf bf bf/ ,          # 3.3.10
+   qq/c0 e0 80 f0 80 80 f8 80 80 80 fc 80 80 80 80 df ef bf f7 bf bf fb bf bf bf fd bf bf bf bf/ , # 3.4.1
+   qq/fe/ ,                      # 3.5.1
+   qq/ff/ ,                      # 3.5.2
+   qq/fe fe ff ff/ ,             # 3.5.3
+   qq/f0 8f bf bf/ ,             # 4.2.3
+   qq/f8 87 bf bf bf/ ,          # 4.2.4
+   qq/fc 83 bf bf bf bf/ ,       # 4.2.5
+   qq/c0 af/ ,                   # 4.1.1  # ! overflow not with perl 5.6
+   qq/e0 80 af/ ,                # 4.1.2  # ! overflow not with perl 5.6
+   qq/f0 80 80 af/ ,             # 4.1.3  # ! overflow not with perl 5.6
+   qq/f8 80 80 80 af/ ,          # 4.1.4  # ! overflow not with perl 5.6
+   qq/fc 80 80 80 80 af/ ,       # 4.1.5  # ! overflow not with perl 5.6
+   qq/c1 bf/ ,                   # 4.2.1  # ! overflow not with perl 5.6
+   qq/e0 9f bf/ ,                # 4.2.2  # ! overflow not with perl 5.6
+   qq/c0 80/ ,                   # 4.3.1  # xx! overflow not with perl 5.6
+   qq/e0 80 80/ ,                # 4.3.2  # xx! overflow not with perl 5.6
+   qq/f0 80 80 80/ ,             # 4.3.3  # xx! overflow not with perl 5.6
+   qq/f8 80 80 80 80/ ,          # 4.3.4  # xx! overflow not with perl 5.6
+   qq/fc 80 80 80 80 80/ ,       # 4.3.5  # xx! overflow not with perl 5.6
+   # non-shortest form of 5c i.e. "\\"
+   qq/c1 9c/ ,                            # ! not with perl 5.6
+  );
+
+{
+  # these are no multibyte codepoints, just raw utf8 bytes,
+  # so most of them work with 5.6 also.
+  $^W = 1;
+  my $w;
+  warnings->import($] < 5.014 ? 'utf8' : 'nonchar');
+  $SIG{__WARN__} = sub { $w = shift };
+
+  for my $ill (@ill) {
+    my $o = pack "C*" => map {hex} split /\s+/, $ill;
+    my $d = eval { decode_json("[\"$o\"]"); };
+    is ($d, undef, substr($@,0,25))
+      or diag $w, ' ', $ill, "\t => ", $d->[0], " $@";
+    like($@, qr/malformed UTF-8 character/, "ill-formed utf8 <$ill> throws error");
+    is($d, undef, "without warning");
+    $w = undef;
+  }
+}