Skip to content

Commit

Permalink
breaking: big unicode overhaul
Browse files Browse the repository at this point in the history
Allow utf-8 in character literals

Char Unicode escape syntax.

Validate zig as UTF-8.

overhaul std.unicode

Fully implemented in stage2.

Closes: ziglang#2097
Closes: ziglang#2129
---

About the UTF-8 validation in stage1: This implementation is quite slow,
but the state automaton it claims to represent is correct,
and it has two features faster validators don't that would
make the code in stage1 more complicated:

* They don't provide the code point
* They don't provide the index of the error (although this could be
  hacked in, but at more cost)

I don't want to put that much optimization effort into stage1 and C
code.
  • Loading branch information
shawnl committed Apr 1, 2019
1 parent c5ee98e commit eea8e77
Show file tree
Hide file tree
Showing 10 changed files with 531 additions and 236 deletions.
16 changes: 11 additions & 5 deletions doc/langref.html.in
Expand Up @@ -522,7 +522,8 @@ test "string literals" {
assert(normal_bytes.len == 5);
assert(normal_bytes[1] == 'e');
assert('e' == '\x65');
assert('\U01f4a9' == 128169);
assert('\u{01f4a9}' == 128169);
assert('💩' == 128169);
assert(mem.eql(u8, "hello", "h\x65llo"));

// A C string literal is a null terminated pointer.
Expand Down Expand Up @@ -572,11 +573,15 @@ test "string literals" {
<td>hexadecimal 8-bit character code (2 digits)</td>
</tr>
<tr>
<td><code>\uNNNN</code></td>
<td><code>\u{NN}</code></td>
<td>hexadecimal 8-bit Unicode character code UTF-8 encoded (2 digits)</td>
</tr>
<tr>
<td><code>\u{NNNN}</code></td>
<td>hexadecimal 16-bit Unicode character code UTF-8 encoded (4 digits)</td>
</tr>
<tr>
<td><code>\UNNNNNN</code></td>
<td><code>\u{NNNNNN}</code></td>
<td>hexadecimal 24-bit Unicode character code UTF-8 encoded (6 digits)</td>
</tr>
</table>
Expand Down Expand Up @@ -9491,8 +9496,9 @@ eof &lt;- !.
hex &lt;- [0-9a-fA-F]
char_escape
&lt;- "\\x" hex hex
/ "\\u" hex hex hex hex
/ "\\U" hex hex hex hex hex hex
/ "\\u" { hex hex }
/ "\\u" { hex hex hex hex }
/ "\\u" { hex hex hex hex hex hex }
/ "\\" [nr\\t'"]
char_char
&lt;- char_escape
Expand Down
96 changes: 90 additions & 6 deletions src-self-hosted/ir.zig
Expand Up @@ -1147,7 +1147,10 @@ pub const Builder = struct {
return irb.lvalWrap(scope, inst, lval);
},
ast.Node.Id.MultilineStringLiteral => return error.Unimplemented,
ast.Node.Id.CharLiteral => return error.Unimplemented,
ast.Node.Id.CharLiteral => {
const char_lit = @fieldParentPtr(ast.Node.CharLiteral, "base", node);
return irb.lvalWrap(scope, try irb.genCharLit(char_lit, scope), lval);
},
ast.Node.Id.BoolLiteral => return error.Unimplemented,
ast.Node.Id.NullLiteral => return error.Unimplemented,
ast.Node.Id.UndefinedLiteral => return error.Unimplemented,
Expand Down Expand Up @@ -1333,8 +1336,7 @@ pub const Builder = struct {
) catch |err| switch (err) {
error.OutOfMemory => return error.OutOfMemory,
error.InvalidBase => unreachable,
error.InvalidCharForDigit => unreachable,
error.DigitTooLargeForBase => unreachable,
error.InvalidCharacter => unreachable,
};
errdefer int_val.base.deref(irb.comp);

Expand All @@ -1343,18 +1345,100 @@ pub const Builder = struct {
return inst;
}

/// Lowers a character literal token into a comptime_int constant instruction.
/// `char_lit.token` includes the surrounding single quotes, e.g. 'a', '\x41',
/// '\u{1f4a9}', or a raw (already-validated) UTF-8 encoded character.
/// Returns error.OutOfMemory on allocation failure; malformed tokens are
/// impossible here because the tokenizer already validated them.
pub fn genCharLit(irb: *Builder, char_lit: *ast.Node.CharLiteral, scope: *Scope) !*Inst {
    const char_token = irb.code.tree_scope.tree.tokenSlice(char_lit.token);

    // Initialize to 0 so the `|=` accumulation below is defined.
    var char: u21 = 0;
    got_char: {
        if (char_token[1] == '\\') {
            char = switch (char_token[2]) {
                'x' => {
                    // '\xNN' — exactly two hex digits at indices 3 and 4.
                    // Digits were validated by the tokenizer.
                    const hi = charToDigit(char_token[3], 16) catch unreachable;
                    const lo = charToDigit(char_token[4], 16) catch unreachable;
                    char = (@intCast(u21, hi) << 4) | lo;
                    break :got_char;
                },
                'u' => {
                    // '\u{NN}' / '\u{NNNN}' / '\u{NNNNNN}'; char_token[3] == '{'.
                    // The tokenizer guarantees an even digit count of 2, 4 or 6,
                    // so the closing brace sits at index 6, 8, or 10.
                    var hex_escape_bytes: u8 = undefined;
                    if (char_token[6] == '}') {
                        hex_escape_bytes = 1;
                    } else if (char_token[8] == '}') {
                        hex_escape_bytes = 2;
                    } else if (char_token[10] == '}') {
                        hex_escape_bytes = 3;
                    } else {
                        unreachable;
                    }
                    // Assemble big-endian: the first hex byte is the most
                    // significant one.
                    var off: usize = 4;
                    while (hex_escape_bytes > 0) : (hex_escape_bytes -= 1) {
                        const hi = charToDigit(char_token[off], 16) catch unreachable;
                        const lo = charToDigit(char_token[off + 1], 16) catch unreachable;
                        const byte: u21 = (@intCast(u21, hi) << 4) | lo;
                        char |= byte << @intCast(u5, (hex_escape_bytes - 1) * 8);
                        off += 2;
                    }
                    break :got_char;
                },
                'n' => '\n',
                'r' => '\r',
                '\\' => '\\',
                // The token contains the two source characters '\' 't',
                // so we match on the letter 't', not on a literal tab.
                't' => '\t',
                '\'' => '\'',
                '\"' => '\"',
                else => unreachable,
            };
            break :got_char;
        }
        // Raw UTF-8 character. This could read one byte past the end of the
        // file, except this guarantees to not read past the first character,
        // and we have already validated the file as UTF-8.
        _ = utf8Decode(char_token[1..4], &char);
        break :got_char;
    }

    const comptime_int_type = Type.ComptimeInt.get(irb.comp);
    defer comptime_int_type.base.base.deref(irb.comp);

    // Pass the decoded code point (not an undefined identifier) to the value
    // constructor.
    const int_val = Value.Int.createFromCharLiteral(
        irb.comp,
        &comptime_int_type.base,
        char,
    ) catch |err| switch (err) {
        error.OutOfMemory => return error.OutOfMemory,
    };
    errdefer int_val.base.deref(irb.comp);

    const inst = try irb.build(Inst.Const, scope, Span.token(char_lit.token), Inst.Const.Params{});
    inst.val = IrVal{ .KnownValue = &int_val.base };
    return inst;
}

pub async fn genStrLit(irb: *Builder, str_lit: *ast.Node.StringLiteral, scope: *Scope) !*Inst {
const str_token = irb.code.tree_scope.tree.tokenSlice(str_lit.token);
const src_span = Span.token(str_lit.token);

var bad_index: usize = undefined;
var buf = std.zig.parseStringLiteral(irb.comp.gpa(), str_token, &bad_index) catch |err| switch (err) {
error.OutOfMemory => return error.OutOfMemory,
error.InvalidCharacter => {
.OutOfMemory => return error.OutOfMemory,
.UnicodeSurrogateHalf, .UnicodeCodepointTooLarge => {
var hex_string = if (mem.indexOfScalar(u8, str_token, '}')) |i| str_token[2..i] else str_token[2..str_token.len];
try irb.comp.addCompileError(
irb.code.tree_scope,
src_span,
"Unicode codepoint U+{} cannot be represented in UTF-16 and is invalid",
hex_string,
);
return error.SemanticAnalysisFailed;
},
.ExpectXDigit, .ExpectLCurly, .ExpectRCurly => {
try irb.comp.addCompileError(
irb.code.tree_scope,
src_span,
"invalid character in string literal: '{c}'",
"expected {}, got '{c}'",
switch (err) {
.ExpectXDigit => "hexidecimal digit",
.ExpectLCurly => "left curly bracket '{'",
.ExpectRCurly => "right curly bracket '}'",
},
str_token[bad_index],
);
return error.SemanticAnalysisFailed;
Expand Down
21 changes: 21 additions & 0 deletions src-self-hosted/value.zig
Expand Up @@ -534,6 +534,27 @@ pub const Value = struct {
return self;
}

/// Creates a ref-counted Int value holding the given character literal code
/// point. Takes a reference on `typ` for the lifetime of the returned value.
/// Caller owns the returned *Int (initial ref_count of 1) and releases it
/// with deref. Returns error.OutOfMemory on allocation failure; on error,
/// all partially-acquired resources (self, the type ref, the big int) are
/// released.
pub fn createFromCharLiteral(comp: *Compilation, typ: *Type, value: u21) !*Int {
    const self = try comp.gpa().create(Value.Int);
    errdefer comp.gpa().destroy(self);

    self.* = Value.Int{
        .base = Value{
            .id = Value.Id.Int,
            .typ = typ,
            .ref_count = std.atomic.Int(usize).init(1),
        },
        .big_int = undefined,
    };

    typ.base.ref();
    // Release the type reference if a later step fails; otherwise it is
    // dropped when the returned value's ref_count reaches zero.
    errdefer typ.base.deref(comp);

    self.big_int = try std.math.big.Int.init(comp.gpa());
    errdefer self.big_int.deinit();

    try self.big_int.set(value);

    return self;
}

pub fn getLlvmConst(self: *Int, ofile: *ObjectFile) !?*llvm.Value {
switch (self.base.typ.id) {
Type.Id.Int => {
Expand Down
102 changes: 76 additions & 26 deletions src/tokenizer.cpp
Expand Up @@ -7,6 +7,7 @@

#include "tokenizer.hpp"
#include "util.hpp"
#include "utf8-lookup.h"

#include <stdarg.h>
#include <stdlib.h>
Expand Down Expand Up @@ -219,6 +220,7 @@ enum TokenizeState {
TokenizeStateSawDotDot,
TokenizeStateSawAtSign,
TokenizeStateCharCode,
TokenizeStateCharCodeStart,
TokenizeStateError,
TokenizeStateLBracket,
TokenizeStateLBracketStar,
Expand All @@ -238,8 +240,7 @@ struct Tokenize {
uint32_t radix;
int32_t exp_add_amt;
bool is_exp_negative;
size_t char_code_index;
size_t char_code_end;
size_t xdigits_seen;
bool unicode;
uint32_t char_code;
int exponent_in_bin_or_dec;
Expand Down Expand Up @@ -407,8 +408,44 @@ void tokenize(Buf *buf, Tokenization *out) {
out->line_offsets = allocate<ZigList<size_t>>(1);

out->line_offsets->append(0);
// This also jumps forward when a char literal is read
unsigned remaining_bytes_in_cur_char = 0;
for (t.pos = 0; t.pos < buf_len(t.buf); t.pos += 1) {
uint8_t c = buf_ptr(t.buf)[t.pos];
uint32_t cp;
// Reject non-ASCII, except for valid UTF-8 inside strings and comments, and utf-8 character literals
// Note that this peeks ahead.
//
// In the zig version of the compiler we should look at these to build something streaming and fast:
// https://github.com/cyb70289/utf8/
// https://lemire.me/blog/2018/10/19/validating-utf-8-bytes-using-only-0-45-cycles-per-byte-avx-edition/
if (c & 0x80 && t.state != TokenizeStateError) {
if (remaining_bytes_in_cur_char > 0) {
remaining_bytes_in_cur_char--;
} else if (t.state == TokenizeStateLineComment ||
t.state == TokenizeStateLineString ||
t.state == TokenizeStateString ||
t.state == TokenizeStateCharLiteral) {
uint32_t state = 0;
unsigned i;
remaining_bytes_in_cur_char = utf8_skip_data[c];
if (buf_len(t.buf) < (t.pos + remaining_bytes_in_cur_char - 1))
tokenize_error(&t, "invalid UTF-8");
// this is a full validator implementing finite-state-automata
// https://bjoern.hoehrmann.de/utf-8/decoder/dfa/
cp = 0;
for (i = 0;i < remaining_bytes_in_cur_char; i++)
utf8_decode(&state, &cp, (uint8_t)buf_ptr(t.buf)[t.pos + i]);
if (state != UTF8_ACCEPT)
tokenize_error(&t, "invalid UTF-8");
remaining_bytes_in_cur_char--;
if (t.state == TokenizeStateCharLiteral) {
t.pos += remaining_bytes_in_cur_char;
remaining_bytes_in_cur_char = 0;
}
} else
invalid_char_error(&t, c);
}
switch (t.state) {
case TokenizeStateError:
break;
Expand Down Expand Up @@ -1050,24 +1087,14 @@ void tokenize(Buf *buf, Tokenization *out) {
t.state = TokenizeStateCharCode;
t.radix = 16;
t.char_code = 0;
t.char_code_index = 0;
t.char_code_end = 2;
t.xdigits_seen = 0;
t.unicode = false;
break;
case 'u':
t.state = TokenizeStateCharCode;
t.radix = 16;
t.char_code = 0;
t.char_code_index = 0;
t.char_code_end = 4;
t.unicode = true;
break;
case 'U':
t.state = TokenizeStateCharCode;
t.state = TokenizeStateCharCodeStart;
t.radix = 16;
t.char_code = 0;
t.char_code_index = 0;
t.char_code_end = 6;
t.xdigits_seen = 0;
t.unicode = true;
break;
case 'n':
Expand All @@ -1092,20 +1119,35 @@ void tokenize(Buf *buf, Tokenization *out) {
invalid_char_error(&t, c);
}
break;
case TokenizeStateCharCodeStart:
if (c != '{')
tokenize_error(&t, "expected {: '%c'", c);
t.state = TokenizeStateCharCode;
break;
case TokenizeStateCharCode:
{
uint32_t digit_value = get_digit_value(c);
if (digit_value >= t.radix) {
tokenize_error(&t, "invalid digit: '%c'", c);
}
t.char_code *= t.radix;
t.char_code += digit_value;
t.char_code_index += 1;
if (c != '}') {
uint32_t digit_value = get_digit_value(c);
if (digit_value >= t.radix) {
tokenize_error(&t, "invalid digit: '%c'", c);
}
t.char_code *= t.radix;
t.char_code += digit_value;
t.xdigits_seen += 1;

if (t.xdigits_seen > 6)
tokenize_error(&t, "expected }: '%c'", c);
} else
if (t.xdigits_seen % 2 != 0)
tokenize_error(&t, "expected hex digit: '%c'", c);

if (t.char_code_index >= t.char_code_end) {
if (c == '}' || (!t.unicode && t.xdigits_seen == 2)) {
if (t.unicode) {
if (t.char_code > 0x10ffff) {
tokenize_error(&t, "unicode value out of range: %x", t.char_code);
if (t.char_code > 0xD7FF &&
t.char_code < 0xE000) {
tokenize_error(&t, "unicode surrogate: 0x%x", t.char_code);
} else if (t.char_code > 0x10ffff) {
tokenize_error(&t, "unicode value out of range: 0x%x", t.char_code);
}
if (t.cur_tok->id == TokenIdCharLiteral) {
t.cur_tok->data.char_lit.c = t.char_code;
Expand Down Expand Up @@ -1149,8 +1191,15 @@ void tokenize(Buf *buf, Tokenization *out) {
case '\\':
t.state = TokenizeStateStringEscape;
break;
case '\n':
tokenize_error(&t, "newline not allowed in character literal");
default:
t.cur_tok->data.char_lit.c = c;
if (c < 128)
t.cur_tok->data.char_lit.c = c;
else {
// the utf8 parser/validator skipped forward for us and provided us this
t.cur_tok->data.char_lit.c = cp;
}
t.state = TokenizeStateCharLiteralEnd;
break;
}
Expand Down Expand Up @@ -1387,6 +1436,7 @@ void tokenize(Buf *buf, Tokenization *out) {
break;
case TokenizeStateStringEscape:
case TokenizeStateCharCode:
case TokenizeStateCharCodeStart:
if (t.cur_tok->id == TokenIdStringLiteral) {
tokenize_error(&t, "unterminated string");
} else if (t.cur_tok->id == TokenIdCharLiteral) {
Expand Down

0 comments on commit eea8e77

Please sign in to comment.