From eea8e77df7b19253b820236e27e23dc1c75e1013 Mon Sep 17 00:00:00 2001 From: Shawn Landden Date: Sat, 23 Mar 2019 20:49:10 -0500 Subject: [PATCH] breaking: big unicode overhaul Allow utf-8 in character literals Char Unicode escape syntax. Validate zig as UTF-8. overhaul std.unicode Fully implemented in stage2. Closes: #2097 Closes: #2129 --- About the UTF-8 validation in stage1: This implementation is quite slow, but the state automaton it implements is correct, and it provides two things that faster validators lack, the absence of which would make the code in stage1 more complicated: * They don't provide the code point * They don't provide the index of the error (although this could be hacked in, but at more cost) I don't want to put that much optimization effort into stage1 and C code. --- doc/langref.html.in | 16 +- src-self-hosted/ir.zig | 96 ++++++++++- src-self-hosted/value.zig | 21 +++ src/tokenizer.cpp | 102 +++++++++--- src/utf8-lookup.h | 55 +++++++ std/unicode.zig | 265 ++++++++++++++++++------------- std/zig/parse_string_literal.zig | 108 +++++++++++-- std/zig/parser_test.zig | 16 +- std/zig/tokenizer.zig | 79 +-------- test/stage1/behavior/misc.zig | 9 +- 10 files changed, 531 insertions(+), 236 deletions(-) create mode 100644 src/utf8-lookup.h diff --git a/doc/langref.html.in b/doc/langref.html.in index a561ebc215ce..1bb837953c11 100644 --- a/doc/langref.html.in +++ b/doc/langref.html.in @@ -522,7 +522,8 @@ test "string literals" { assert(normal_bytes.len == 5); assert(normal_bytes[1] == 'e'); assert('e' == '\x65'); - assert('\U01f4a9' == 128169); + assert('\u{01f4a9}' == 128169); + assert('💩' == 128169); assert(mem.eql(u8, "hello", "h\x65llo")); // A C string literal is a null terminated pointer. 
@@ -572,11 +573,15 @@ test "string literals" { hexadecimal 8-bit character code (2 digits) - \uNNNN + \u{NN} + hexadecimal 16-bit Unicode character code UTF-8 encoded (2 digits) + + + \u{NNNN} hexadecimal 16-bit Unicode character code UTF-8 encoded (4 digits) - \UNNNNNN + \u{NNNNNN} hexadecimal 24-bit Unicode character code UTF-8 encoded (6 digits) @@ -9491,8 +9496,9 @@ eof <- !. hex <- [0-9a-fA-F] char_escape <- "\\x" hex hex - / "\\u" hex hex hex hex - / "\\U" hex hex hex hex hex hex + / "\\u" { hex hex } + / "\\u" { hex hex hex hex } + / "\\u" { hex hex hex hex hex hex } / "\\" [nr\\t'"] char_char <- char_escape diff --git a/src-self-hosted/ir.zig b/src-self-hosted/ir.zig index 8cdac92326b2..802985c4453e 100644 --- a/src-self-hosted/ir.zig +++ b/src-self-hosted/ir.zig @@ -1147,7 +1147,10 @@ pub const Builder = struct { return irb.lvalWrap(scope, inst, lval); }, ast.Node.Id.MultilineStringLiteral => return error.Unimplemented, - ast.Node.Id.CharLiteral => return error.Unimplemented, + ast.Node.Id.CharLiteral => { + const char_lit = @fieldParentPtr(ast.Node.CharLiteral, "base", node); + return irb.lvalWrap(scope, try irb.genCharLit(char_lit, scope), lval); + }, ast.Node.Id.BoolLiteral => return error.Unimplemented, ast.Node.Id.NullLiteral => return error.Unimplemented, ast.Node.Id.UndefinedLiteral => return error.Unimplemented, @@ -1333,8 +1336,7 @@ pub const Builder = struct { ) catch |err| switch (err) { error.OutOfMemory => return error.OutOfMemory, error.InvalidBase => unreachable, - error.InvalidCharForDigit => unreachable, - error.DigitTooLargeForBase => unreachable, + error.InvalidCharacter => unreachable, }; errdefer int_val.base.deref(irb.comp); @@ -1343,18 +1345,100 @@ pub const Builder = struct { return inst; } + pub fn genCharLit(irb: *Builder, char_lit: *ast.Node.CharLiteral, scope: *Scope) !*Inst { + const char_token = irb.code.tree_scope.tree.tokenSlice(char_lit.token); + + var char: u21 = undefined; + got_char: { + if (char_token[1] == '\\') { + 
char = switch (char_token[2]) { + 'x' => { + const hi = charToDigit(char_token[off], 16) catch unreachable; + const lo = charToDigit(char_token[off + 1], 16) catch unreachable; + char |= ((hi << 4) | lo) << ((hex_escape_byes - 1) * 8); + break :got_char; + }, + 'u' => { + // char_token[3] == '{'; + if (char_token[6] == '}') { + hex_escape_bytes = 1; + } else if (char_token[8] == '}') { + hex_escape_bytes = 2; + } else if (char_token[10] == '}') { + hex_escape_bytes = 3; + } else { + unreachable; + } + var off: u8 = 4; + while (hex_escape_bytes > 0) : (hex_escape_bytes -= 1) { + const hi = charToDigit(char_token[off], 16) catch unreachable; + const lo = charToDigit(char_token[off + 1], 16) catch unreachable; + char |= ((hi << 4) | lo) << ((hex_escape_byes - 1) * 8); + off += 2; + } + break :got_char; + }, + 'n' => '\n', + 'r' => '\r', + '\\' => '\\', + '\t' => '\t', + '\'' => '\'', + '\"' => '\"', + else => unreachable, + }; + break :got_char; + } + // This could read one byte past the end of the file, except + // this guarantees to not read past the first character, and we + // have already validated the file as UTF-8. 
+ _ = utf8Decode(char_token[1..4], &char); + break :got_char; + } + + const comptime_int_type = Type.ComptimeInt.get(irb.comp); + defer comptime_int_type.base.base.deref(irb.comp); + + const int_val = Value.Int.createFromCharLiteral( + irb.comp, + &comptime_int_type.base, + rest, + ) catch |err| switch (err) { + error.OutOfMemory => return error.OutOfMemory, + }; + errdefer int_val.base.deref(irb.comp); + + const inst = try irb.build(Inst.Const, scope, Span.token(char_lit.token), Inst.Const.Params{}); + inst.val = IrVal{ .KnownValue = &int_val.base }; + return inst; + } + pub async fn genStrLit(irb: *Builder, str_lit: *ast.Node.StringLiteral, scope: *Scope) !*Inst { const str_token = irb.code.tree_scope.tree.tokenSlice(str_lit.token); const src_span = Span.token(str_lit.token); var bad_index: usize = undefined; var buf = std.zig.parseStringLiteral(irb.comp.gpa(), str_token, &bad_index) catch |err| switch (err) { - error.OutOfMemory => return error.OutOfMemory, - error.InvalidCharacter => { + .OutOfMemory => return error.OutOfMemory, + .UnicodeSurrogateHalf, .UnicodeCodepointTooLarge => { + var hex_string = if (mem.indexOfScalar(u8, str_token, '}')) |i| str_token[2..i] else str_token[2..str_token.len]; + try irb.comp.addCompileError( + irb.code.tree_scope, + src_span, + "Unicode codepoint U+{} cannot be represented in UTF-16 and is invalid", + hex_string, + ); + return error.SemanticAnalysisFailed; + }, + .ExpectXDigit, .ExpectLCurly, .ExpectRCurly => { try irb.comp.addCompileError( irb.code.tree_scope, src_span, - "invalid character in string literal: '{c}'", + "expected {}, got '{c}'", + switch (err) { + .ExpectXDigit => "hexidecimal digit", + .ExpectLCurly => "left curly bracket '{'", + .ExpectRCurly => "right curly bracket '}'", + }, str_token[bad_index], ); return error.SemanticAnalysisFailed; diff --git a/src-self-hosted/value.zig b/src-self-hosted/value.zig index d8c0f7b5c87c..0a78395ecd9b 100644 --- a/src-self-hosted/value.zig +++ b/src-self-hosted/value.zig 
@@ -534,6 +534,27 @@ pub const Value = struct { return self; } + pub fn createFromCharLiteral(comp: *Compilation, typ: *Type, value: u21) !*Int { + const self = try comp.gpa().create(Value.Int); + self.* = Value.Int{ + .base = Value{ + .id = Value.Id.Int, + .typ = typ, + .ref_count = std.atomic.Int(usize).init(1), + }, + .big_int = undefined, + }; + typ.base.ref(); + errdefer comp.gpa().destroy(self); + + self.big_int = try std.math.big.Int.init(comp.gpa()); + errdefer self.big_int.deinit(); + + try self.big_int.set(value); + + return self; + } + pub fn getLlvmConst(self: *Int, ofile: *ObjectFile) !?*llvm.Value { switch (self.base.typ.id) { Type.Id.Int => { diff --git a/src/tokenizer.cpp b/src/tokenizer.cpp index 53554d1096d0..911c334dc295 100644 --- a/src/tokenizer.cpp +++ b/src/tokenizer.cpp @@ -7,6 +7,7 @@ #include "tokenizer.hpp" #include "util.hpp" +#include "utf8-lookup.h" #include #include @@ -219,6 +220,7 @@ enum TokenizeState { TokenizeStateSawDotDot, TokenizeStateSawAtSign, TokenizeStateCharCode, + TokenizeStateCharCodeStart, TokenizeStateError, TokenizeStateLBracket, TokenizeStateLBracketStar, @@ -238,8 +240,7 @@ struct Tokenize { uint32_t radix; int32_t exp_add_amt; bool is_exp_negative; - size_t char_code_index; - size_t char_code_end; + size_t xdigits_seen; bool unicode; uint32_t char_code; int exponent_in_bin_or_dec; @@ -407,8 +408,44 @@ void tokenize(Buf *buf, Tokenization *out) { out->line_offsets = allocate>(1); out->line_offsets->append(0); + // This also jumps forward when a char literal is read + unsigned remaining_bytes_in_cur_char = 0; for (t.pos = 0; t.pos < buf_len(t.buf); t.pos += 1) { uint8_t c = buf_ptr(t.buf)[t.pos]; + uint32_t cp; + // Reject non-ASCII, except for valid UTF-8 inside strings and comments, and utf-8 character literals + // Note that this peaks ahead. 
+ // + // In the zig version of the compiler we should look at these to build something streaming and fast: + // https://github.com/cyb70289/utf8/ + // https://lemire.me/blog/2018/10/19/validating-utf-8-bytes-using-only-0-45-cycles-per-byte-avx-edition/ + if (c & 0x80 && t.state != TokenizeStateError) { + if (remaining_bytes_in_cur_char > 0) { + remaining_bytes_in_cur_char--; + } else if (t.state == TokenizeStateLineComment || + t.state == TokenizeStateLineString || + t.state == TokenizeStateString || + t.state == TokenizeStateCharLiteral) { + uint32_t state = 0; + unsigned i; + remaining_bytes_in_cur_char = utf8_skip_data[c]; + if (buf_len(t.buf) < (t.pos + remaining_bytes_in_cur_char - 1)) + tokenize_error(&t, "invalid UTF-8"); + // this is a full validator implementing finite-state-automata + // https://bjoern.hoehrmann.de/utf-8/decoder/dfa/ + cp = 0; + for (i = 0;i < remaining_bytes_in_cur_char; i++) + utf8_decode(&state, &cp, (uint8_t)buf_ptr(t.buf)[t.pos + i]); + if (state != UTF8_ACCEPT) + tokenize_error(&t, "invalid UTF-8"); + remaining_bytes_in_cur_char--; + if (t.state == TokenizeStateCharLiteral) { + t.pos += remaining_bytes_in_cur_char; + remaining_bytes_in_cur_char = 0; + } + } else + invalid_char_error(&t, c); + } switch (t.state) { case TokenizeStateError: break; @@ -1050,24 +1087,14 @@ void tokenize(Buf *buf, Tokenization *out) { t.state = TokenizeStateCharCode; t.radix = 16; t.char_code = 0; - t.char_code_index = 0; - t.char_code_end = 2; + t.xdigits_seen = 0; t.unicode = false; break; case 'u': - t.state = TokenizeStateCharCode; - t.radix = 16; - t.char_code = 0; - t.char_code_index = 0; - t.char_code_end = 4; - t.unicode = true; - break; - case 'U': - t.state = TokenizeStateCharCode; + t.state = TokenizeStateCharCodeStart; t.radix = 16; t.char_code = 0; - t.char_code_index = 0; - t.char_code_end = 6; + t.xdigits_seen = 0; t.unicode = true; break; case 'n': @@ -1092,20 +1119,35 @@ void tokenize(Buf *buf, Tokenization *out) { invalid_char_error(&t, 
c); } break; + case TokenizeStateCharCodeStart: + if (c != '{') + tokenize_error(&t, "expected {: '%c'", c); + t.state = TokenizeStateCharCode; + break; case TokenizeStateCharCode: { - uint32_t digit_value = get_digit_value(c); - if (digit_value >= t.radix) { - tokenize_error(&t, "invalid digit: '%c'", c); - } - t.char_code *= t.radix; - t.char_code += digit_value; - t.char_code_index += 1; + if (c != '}') { + uint32_t digit_value = get_digit_value(c); + if (digit_value >= t.radix) { + tokenize_error(&t, "invalid digit: '%c'", c); + } + t.char_code *= t.radix; + t.char_code += digit_value; + t.xdigits_seen += 1; + + if (t.xdigits_seen > 6) + tokenize_error(&t, "expected }: '%c'", c); + } else + if (t.xdigits_seen % 2 != 0) + tokenize_error(&t, "expected hex digit: '%c'", c); - if (t.char_code_index >= t.char_code_end) { + if (c == '}' || (!t.unicode && t.xdigits_seen == 2)) { if (t.unicode) { - if (t.char_code > 0x10ffff) { - tokenize_error(&t, "unicode value out of range: %x", t.char_code); + if (t.char_code > 0xD7FF && + t.char_code < 0xE000) { + tokenize_error(&t, "unicode surrogate: 0x%x", t.char_code); + } else if (t.char_code > 0x10ffff) { + tokenize_error(&t, "unicode value out of range: 0x%x", t.char_code); } if (t.cur_tok->id == TokenIdCharLiteral) { t.cur_tok->data.char_lit.c = t.char_code; @@ -1149,8 +1191,15 @@ void tokenize(Buf *buf, Tokenization *out) { case '\\': t.state = TokenizeStateStringEscape; break; + case '\n': + tokenize_error(&t, "newline not allowed in character literal"); default: - t.cur_tok->data.char_lit.c = c; + if (c < 128) + t.cur_tok->data.char_lit.c = c; + else { + // the utf8 parser/validator skipped forward for us and provided us this + t.cur_tok->data.char_lit.c = cp; + } t.state = TokenizeStateCharLiteralEnd; break; } @@ -1387,6 +1436,7 @@ void tokenize(Buf *buf, Tokenization *out) { break; case TokenizeStateStringEscape: case TokenizeStateCharCode: + case TokenizeStateCharCodeStart: if (t.cur_tok->id == TokenIdStringLiteral) 
{ tokenize_error(&t, "unterminated string"); } else if (t.cur_tok->id == TokenIdCharLiteral) { diff --git a/src/utf8-lookup.h b/src/utf8-lookup.h new file mode 100644 index 000000000000..df9445354f13 --- /dev/null +++ b/src/utf8-lookup.h @@ -0,0 +1,55 @@ +// Copyright (c) 2008-2009 Bjoern Hoehrmann +// See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details. + +//Copyright (c) 2008-2009 Bjoern Hoehrmann + +//Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +//The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +//THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ +#define UTF8_ACCEPT 0 +#define UTF8_REJECT 1 + +static const uint8_t utf8d[] = { + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 00..1f + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 20..3f + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 40..5f + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 60..7f + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, // 80..9f + 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, // a0..bf + 8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // c0..df + 0xa,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x4,0x3,0x3, // e0..ef + 0xb,0x6,0x6,0x6,0x5,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8, // f0..ff + 0x0,0x1,0x2,0x3,0x5,0x8,0x7,0x1,0x1,0x1,0x4,0x6,0x1,0x1,0x1,0x1, // s0..s0 + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,0,1,0,1,1,1,1,1,1, // s1..s2 + 1,2,1,1,1,1,1,2,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1, // s3..s4 + 1,2,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,3,1,3,1,1,1,1,1,1, // s5..s6 + 1,3,1,1,1,1,1,3,1,3,1,1,1,1,1,1,1,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // s7..s8 +}; + +uint32_t inline +utf8_decode(uint32_t* state, uint32_t* codep, uint32_t byte) { + uint32_t type = utf8d[byte]; + + *codep = (*state != UTF8_ACCEPT) ? + (byte & 0x3fu) | (*codep << 6) : + (0xff >> type) & (byte); + + *state = utf8d[256 + *state*16 + type]; + return *state; +} + +// This does not validate, it just provides the length of an already-valid character +// based on the first byte. 
+const char utf8_skip_data[256] = { + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,6,6,7,8, +}; diff --git a/std/unicode.zig b/std/unicode.zig index 37a73d75004b..1f4772328fb2 100644 --- a/std/unicode.zig +++ b/std/unicode.zig @@ -4,25 +4,74 @@ const assert = std.debug.assert; const testing = std.testing; const mem = std.mem; +pub const Utf8Error = UnicodeError || error{ + Utf8ShortChar, + Utf8OverlongEncoding, + Utf8InvalidStartByte, +}; + +pub const UnicodeError = error{ + UnicodeSurrogateHalf, + UnicodeCodepointTooLarge, +}; + +// http://www.unicode.org/versions/Unicode6.0.0/ch03.pdf - page 94 +// +// Table 3-7. 
Well-Formed UTF-8 Byte Sequences +// +// +--------------------+------------+-------------+------------+-------------+ +// | Code Points | First Byte | Second Byte | Third Byte | Fourth Byte | +// +--------------------+------------+-------------+------------+-------------+ +// | U+0000..U+007F | 00..7F | | | | +// +--------------------+------------+-------------+------------+-------------+ +// | U+0080..U+07FF | C2..DF | 80..BF | | | +// +--------------------+------------+-------------+------------+-------------+ +// | U+0800..U+0FFF | E0 | A0..BF | 80..BF | | +// +--------------------+------------+-------------+------------+-------------+ +// | U+1000..U+CFFF | E1..EC | 80..BF | 80..BF | | +// +--------------------+------------+-------------+------------+-------------+ +// | U+D000..U+D7FF | ED | 80..9F | 80..BF | | +// +--------------------+------------+-------------+------------+-------------+ +// | U+E000..U+FFFF | EE..EF | 80..BF | 80..BF | | +// +--------------------+------------+-------------+------------+-------------+ +// | U+10000..U+3FFFF | F0 | 90..BF | 80..BF | 80..BF | +// +--------------------+------------+-------------+------------+-------------+ +// | U+40000..U+FFFFF | F1..F3 | 80..BF | 80..BF | 80..BF | +// +--------------------+------------+-------------+------------+-------------+ +// | U+100000..U+10FFFF | F4 | 80..8F | 80..BF | 80..BF | +// +--------------------+------------+-------------+------------+-------------+ + +// This accepts u32 instead of u21 on purpose: every 32-bit value is classified; valid code points yield the void value {}, the rest an UnicodeError. +pub fn isValidUnicode(c: u32) UnicodeError!void { + return switch (c) { + 0x0000...0xd7ff => {}, + 0xd800...0xdfff => error.UnicodeSurrogateHalf, + 0xe000...0x10ffff => {}, + 0x110000...0xffffffff => error.UnicodeCodepointTooLarge, + }; +} + /// Returns how many bytes the UTF-8 representation would require /// for the given codepoint. 
-pub fn utf8CodepointSequenceLength(c: u32) !u3 { +pub fn utf8CodepointSequenceLength(c: u21) Utf8Error!u3 { if (c < 0x80) return u3(1); if (c < 0x800) return u3(2); if (c < 0x10000) return u3(3); if (c < 0x110000) return u3(4); - return error.CodepointTooLarge; + return error.UnicodeCodepointTooLarge; } /// Given the first byte of a UTF-8 codepoint, /// returns a number 1-4 indicating the total length of the codepoint in bytes. /// If this byte does not match the form of a UTF-8 start byte, returns Utf8InvalidStartByte. -pub fn utf8ByteSequenceLength(first_byte: u8) !u3 { - if (first_byte < 0b10000000) return u3(1); - if (first_byte & 0b11100000 == 0b11000000) return u3(2); - if (first_byte & 0b11110000 == 0b11100000) return u3(3); - if (first_byte & 0b11111000 == 0b11110000) return u3(4); - return error.Utf8InvalidStartByte; +pub fn utf8ByteSequenceLength(first_byte: u8) Utf8Error!u3 { + const INVALID = 0; + const swtch = []u8{1, INVALID, 2, 3, 4, INVALID, INVALID, INVALID, INVALID}; + var len = swtch[@clz(~first_byte)]; + if (len == INVALID) { + return error.Utf8InvalidStartByte; + } + return @intCast(u3, len); } /// Encodes the given codepoint into a UTF-8 byte sequence. @@ -30,7 +79,7 @@ pub fn utf8ByteSequenceLength(first_byte: u8) !u3 { /// out: the out buffer to write to. Must have a len >= utf8CodepointSequenceLength(c). /// Errors: if c cannot be encoded in UTF-8. /// Returns: the number of bytes written to out. 
-pub fn utf8Encode(c: u32, out: []u8) !u3 { +pub fn utf8Encode(c: u21, out: []u8) Utf8Error!u3 { const length = try utf8CodepointSequenceLength(c); assert(out.len >= length); switch (length) { @@ -44,7 +93,7 @@ pub fn utf8Encode(c: u32, out: []u8) !u3 { out[1] = @intCast(u8, 0b10000000 | (c & 0b111111)); }, 3 => { - if (0xd800 <= c and c <= 0xdfff) return error.Utf8CannotEncodeSurrogateHalf; + if (0xd800 <= c and c <= 0xdfff) return error.UnicodeSurrogateHalf; out[0] = @intCast(u8, 0b11100000 | (c >> 12)); out[1] = @intCast(u8, 0b10000000 | ((c >> 6) & 0b111111)); out[2] = @intCast(u8, 0b10000000 | (c & 0b111111)); @@ -60,32 +109,36 @@ pub fn utf8Encode(c: u32, out: []u8) !u3 { return length; } -const Utf8DecodeError = Utf8Decode2Error || Utf8Decode3Error || Utf8Decode4Error; - -/// Decodes the UTF-8 codepoint encoded in the given slice of bytes. -/// bytes.len must be equal to utf8ByteSequenceLength(bytes[0]) catch unreachable. -/// If you already know the length at comptime, you can call one of -/// utf8Decode2,utf8Decode3,utf8Decode4 directly instead of this function. -pub fn utf8Decode(bytes: []const u8) Utf8DecodeError!u32 { - return switch (bytes.len) { +/// Decodes the UTF-8 codepoint encoded in the given slice of bytes and returns +/// the length of the character decoded. +/// +/// Guaranteed to not read bytes past this character. +/// +/// "ret" cannot be *u21 because when casting to *u32 it would have different +/// behavior on Little-Endian and Big-Endian machines, which is too much to ask +/// of our callers. 
+/// https://github.com/ziglang/zig/issues/2136 +pub fn utf8Decode(bytes: []const u8, ret: *align(4) u32) Utf8Error!u3 { + var len = try utf8ByteSequenceLength(bytes[0]); + if (bytes.len < len) { + return error.Utf8ShortChar; + } + ret.* = switch (len) { 1 => u32(bytes[0]), - 2 => utf8Decode2(bytes), - 3 => utf8Decode3(bytes), - 4 => utf8Decode4(bytes), + 2 => try utf8Decode2(bytes[0..2]), + 3 => try utf8Decode3(bytes[0..3]), + 4 => try utf8Decode4(bytes[0..4]), else => unreachable, }; + return len; } -const Utf8Decode2Error = error{ - Utf8ExpectedContinuation, - Utf8OverlongEncoding, -}; -pub fn utf8Decode2(bytes: []const u8) Utf8Decode2Error!u32 { +pub fn utf8Decode2(bytes: []const u8) Utf8Error!u32 { assert(bytes.len == 2); assert(bytes[0] & 0b11100000 == 0b11000000); var value: u32 = bytes[0] & 0b00011111; - if (bytes[1] & 0b11000000 != 0b10000000) return error.Utf8ExpectedContinuation; + if (@clz(~bytes[1]) != 1) return error.Utf8ShortChar; value <<= 6; value |= bytes[1] & 0b00111111; @@ -94,74 +147,67 @@ pub fn utf8Decode2(bytes: []const u8) Utf8Decode2Error!u32 { return value; } -const Utf8Decode3Error = error{ - Utf8ExpectedContinuation, - Utf8OverlongEncoding, - Utf8EncodesSurrogateHalf, -}; -pub fn utf8Decode3(bytes: []const u8) Utf8Decode3Error!u32 { +pub fn utf8Decode3(bytes: []const u8) Utf8Error!u32 { assert(bytes.len == 3); assert(bytes[0] & 0b11110000 == 0b11100000); var value: u32 = bytes[0] & 0b00001111; - if (bytes[1] & 0b11000000 != 0b10000000) return error.Utf8ExpectedContinuation; + if (@clz(~bytes[1]) != 1) return error.Utf8ShortChar; value <<= 6; value |= bytes[1] & 0b00111111; - if (bytes[2] & 0b11000000 != 0b10000000) return error.Utf8ExpectedContinuation; + if (@clz(~bytes[2]) != 1) return error.Utf8ShortChar; value <<= 6; value |= bytes[2] & 0b00111111; if (value < 0x800) return error.Utf8OverlongEncoding; - if (0xd800 <= value and value <= 0xdfff) return error.Utf8EncodesSurrogateHalf; + if (0xd800 <= value and value <= 0xdfff) return 
error.UnicodeSurrogateHalf; return value; } -const Utf8Decode4Error = error{ - Utf8ExpectedContinuation, - Utf8OverlongEncoding, - Utf8CodepointTooLarge, -}; -pub fn utf8Decode4(bytes: []const u8) Utf8Decode4Error!u32 { +pub fn utf8Decode4(bytes: []const u8) Utf8Error!u32 { assert(bytes.len == 4); assert(bytes[0] & 0b11111000 == 0b11110000); var value: u32 = bytes[0] & 0b00000111; - if (bytes[1] & 0b11000000 != 0b10000000) return error.Utf8ExpectedContinuation; + if (@clz(~bytes[1]) != 1) return error.Utf8ShortChar; // each continuation byte must be 0b10xxxxxx, i.e. exactly one leading 1 bit value <<= 6; value |= bytes[1] & 0b00111111; - if (bytes[2] & 0b11000000 != 0b10000000) return error.Utf8ExpectedContinuation; + if (@clz(~bytes[2]) != 1) return error.Utf8ShortChar; value <<= 6; value |= bytes[2] & 0b00111111; - if (bytes[3] & 0b11000000 != 0b10000000) return error.Utf8ExpectedContinuation; + if (@clz(~bytes[3]) != 1) return error.Utf8ShortChar; value <<= 6; value |= bytes[3] & 0b00111111; if (value < 0x10000) return error.Utf8OverlongEncoding; - if (value > 0x10FFFF) return error.Utf8CodepointTooLarge; + if (value > 0x10FFFF) return error.UnicodeCodepointTooLarge; return value; } -pub fn utf8ValidateSlice(s: []const u8) bool { +// TODO replace with something faster: +// https://github.com/cyb70289/utf8/ +// https://lemire.me/blog/2018/10/19/validating-utf-8-bytes-using-only-0-45-cycles-per-byte-avx-edition/ +pub fn utf8ValidateSliceWithLoc(s: []const u8, ret_invalid_maybe: ?*usize) Utf8Error!void { var i: usize = 0; while (i < s.len) { - if (utf8ByteSequenceLength(s[i])) |cp_len| { - if (i + cp_len > s.len) { - return false; + var c: u32 = undefined; + i += utf8Decode(s[i..], &c) catch |err| { + if (ret_invalid_maybe) |ret_invalid| { + ret_invalid.* = i; } - - if (utf8Decode(s[i .. 
i + cp_len])) |_| {} else |_| { - return false; - } - i += cp_len; - } else |err| { - return false; - } + return err; + }; } + return; +} + +pub fn utf8ValidateSlice(s: []const u8) bool { + utf8ValidateSliceWithLoc(s, null) catch return false; return true; } @@ -177,11 +223,9 @@ pub const Utf8View = struct { bytes: []const u8, pub fn init(s: []const u8) !Utf8View { - if (!utf8ValidateSlice(s)) { - return error.InvalidUtf8; - } - - return initUnchecked(s); + if (utf8ValidateSlice(s)) { + return initUnchecked(s); + } else return error.InvalidUtf8; } pub fn initUnchecked(s: []const u8) Utf8View { @@ -192,11 +236,9 @@ pub const Utf8View = struct { pub fn initComptime(comptime s: []const u8) Utf8View { if (comptime init(s)) |r| { return r; - } else |err| switch (err) { - error.InvalidUtf8 => { - @compileError("invalid utf8"); - unreachable; - }, + } else |err| { + @compileError("invalid utf8"); + unreachable; } } @@ -217,21 +259,19 @@ pub const Utf8Iterator = struct { return null; } - const cp_len = utf8ByteSequenceLength(it.bytes[it.i]) catch unreachable; + const cp_len = utf8ByteSequenceLength(it.bytes[it.i]) catch return null; // an invalid start byte ends iteration instead of producing an optional length it.i += cp_len; return it.bytes[it.i - cp_len .. 
it.i]; } - pub fn nextCodepoint(it: *Utf8Iterator) ?u32 { - const slice = it.nextCodepointSlice() orelse return null; - - switch (slice.len) { - 1 => return u32(slice[0]), - 2 => return utf8Decode2(slice) catch unreachable, - 3 => return utf8Decode3(slice) catch unreachable, - 4 => return utf8Decode4(slice) catch unreachable, - else => unreachable, + pub fn nextCodepoint(it: *Utf8Iterator) ?u21 { + if (it.i >= it.bytes.len) { + return null; } + + var c: u32 = undefined; + it.i += utf8Decode(it.bytes[it.i..], &c) catch return null; + return @intCast(u21, c); } }; @@ -246,7 +286,7 @@ pub const Utf16LeIterator = struct { }; } - pub fn nextCodepoint(it: *Utf16LeIterator) !?u32 { + pub fn nextCodepoint(it: *Utf16LeIterator) !?u21 { assert(it.i <= it.bytes.len); if (it.i == it.bytes.len) return null; const c0: u32 = mem.readIntSliceLittle(u16, it.bytes[it.i .. it.i + 2]); @@ -257,12 +297,12 @@ pub const Utf16LeIterator = struct { const c1: u32 = mem.readIntSliceLittle(u16, it.bytes[it.i .. it.i + 2]); if (c1 & ~u32(0x03ff) != 0xdc00) return error.ExpectedSecondSurrogateHalf; it.i += 2; - return 0x10000 + (((c0 & 0x03ff) << 10) | (c1 & 0x03ff)); + return @truncate(u21, 0x10000 + (((c0 & 0x03ff) << 10) | (c1 & 0x03ff))); } else if (c0 & ~u32(0x03ff) == 0xdc00) { return error.UnexpectedSecondSurrogateHalf; } else { it.i += 2; - return c0; + return @truncate(u21, c0); } } }; @@ -274,19 +314,19 @@ test "utf8 encode" { fn testUtf8Encode() !void { // A few taken from wikipedia a few taken elsewhere var array: [4]u8 = undefined; - testing.expect((try utf8Encode(try utf8Decode("โ‚ฌ"), array[0..])) == 3); + testing.expect((try utf8Encode('โ‚ฌ', array[0..])) == 3); testing.expect(array[0] == 0b11100010); testing.expect(array[1] == 0b10000010); testing.expect(array[2] == 0b10101100); - testing.expect((try utf8Encode(try utf8Decode("$"), array[0..])) == 1); + testing.expect((try utf8Encode('$', array[0..])) == 1); testing.expect(array[0] == 0b00100100); - testing.expect((try 
utf8Encode(try utf8Decode("ยข"), array[0..])) == 2); + testing.expect((try utf8Encode('ยข', array[0..])) == 2); testing.expect(array[0] == 0b11000010); testing.expect(array[1] == 0b10100010); - testing.expect((try utf8Encode(try utf8Decode("๐ˆ"), array[0..])) == 4); + testing.expect((try utf8Encode('๐ˆ', array[0..])) == 4); testing.expect(array[0] == 0b11110000); testing.expect(array[1] == 0b10010000); testing.expect(array[2] == 0b10001101); @@ -299,13 +339,12 @@ test "utf8 encode error" { } fn testUtf8EncodeError() void { var array: [4]u8 = undefined; - testErrorEncode(0xd800, array[0..], error.Utf8CannotEncodeSurrogateHalf); - testErrorEncode(0xdfff, array[0..], error.Utf8CannotEncodeSurrogateHalf); - testErrorEncode(0x110000, array[0..], error.CodepointTooLarge); - testErrorEncode(0xffffffff, array[0..], error.CodepointTooLarge); + testErrorEncode(0xd800, array[0..], error.UnicodeSurrogateHalf); + testErrorEncode(0xdfff, array[0..], error.UnicodeSurrogateHalf); + testErrorEncode(0x110000, array[0..], error.UnicodeCodepointTooLarge); } -fn testErrorEncode(codePoint: u32, array: []u8, expectedErr: anyerror) void { +fn testErrorEncode(codePoint: u21, array: []u8, expectedErr: anyerror) void { testing.expectError(expectedErr, utf8Encode(codePoint, array)); } @@ -401,24 +440,24 @@ fn testInvalidUtf8ContinuationBytes() void { testError("\xf8", error.Utf8InvalidStartByte); testError("\xff", error.Utf8InvalidStartByte); // expected continuation for 2 byte sequences - testError("\xc2", error.UnexpectedEof); - testError("\xc2\x00", error.Utf8ExpectedContinuation); - testError("\xc2\xc0", error.Utf8ExpectedContinuation); + testError("\xc2", error.Utf8ShortChar); + testError("\xc2\x00", error.Utf8ShortChar); + testError("\xc2\xc0", error.Utf8ShortChar); // expected continuation for 3 byte sequences - testError("\xe0", error.UnexpectedEof); - testError("\xe0\x00", error.UnexpectedEof); - testError("\xe0\xc0", error.UnexpectedEof); - testError("\xe0\xa0", 
error.UnexpectedEof); - testError("\xe0\xa0\x00", error.Utf8ExpectedContinuation); - testError("\xe0\xa0\xc0", error.Utf8ExpectedContinuation); + testError("\xe0", error.Utf8ShortChar); + testError("\xe0\x00", error.Utf8ShortChar); + testError("\xe0\xc0", error.Utf8ShortChar); + testError("\xe0\xa0", error.Utf8ShortChar); + testError("\xe0\xa0\x00", error.Utf8ShortChar); + testError("\xe0\xa0\xc0", error.Utf8ShortChar); // expected continuation for 4 byte sequences - testError("\xf0", error.UnexpectedEof); - testError("\xf0\x00", error.UnexpectedEof); - testError("\xf0\xc0", error.UnexpectedEof); - testError("\xf0\x90\x00", error.UnexpectedEof); - testError("\xf0\x90\xc0", error.UnexpectedEof); - testError("\xf0\x90\x80\x00", error.Utf8ExpectedContinuation); - testError("\xf0\x90\x80\xc0", error.Utf8ExpectedContinuation); + testError("\xf0", error.Utf8ShortChar); + testError("\xf0\x00", error.Utf8ShortChar); + testError("\xf0\xc0", error.Utf8ShortChar); + testError("\xf0\x90\x00", error.Utf8ShortChar); + testError("\xf0\x90\xc0", error.Utf8ShortChar); + testError("\xf0\x90\x80\x00", error.Utf8ShortChar); + testError("\xf0\x90\x80\xc0", error.Utf8ShortChar); } test "overlong utf8 codepoint" { @@ -440,12 +479,12 @@ test "misc invalid utf8" { } fn testMiscInvalidUtf8() void { // codepoint out of bounds - testError("\xf4\x90\x80\x80", error.Utf8CodepointTooLarge); - testError("\xf7\xbf\xbf\xbf", error.Utf8CodepointTooLarge); + testError("\xf4\x90\x80\x80", error.UnicodeCodepointTooLarge); + testError("\xf7\xbf\xbf\xbf", error.UnicodeCodepointTooLarge); // surrogate halves testValid("\xed\x9f\xbf", 0xd7ff); - testError("\xed\xa0\x80", error.Utf8EncodesSurrogateHalf); - testError("\xed\xbf\xbf", error.Utf8EncodesSurrogateHalf); + testError("\xed\xa0\x80", error.UnicodeSurrogateHalf); + testError("\xed\xbf\xbf", error.UnicodeSurrogateHalf); testValid("\xee\x80\x80", 0xe000); } @@ -459,9 +498,11 @@ fn testValid(bytes: []const u8, expected_codepoint: u32) void { fn 
testDecode(bytes: []const u8) !u32 { const length = try utf8ByteSequenceLength(bytes[0]); - if (bytes.len < length) return error.UnexpectedEof; + if (bytes.len < length) return error.Utf8ShortChar; testing.expect(bytes.len == length); - return utf8Decode(bytes); + var c: u32 = undefined; + _ = try utf8Decode(bytes, &c); + return c; } /// Caller must free returned memory. diff --git a/std/zig/parse_string_literal.zig b/std/zig/parse_string_literal.zig index acae0b64c79c..4ec32242a169 100644 --- a/std/zig/parse_string_literal.zig +++ b/std/zig/parse_string_literal.zig @@ -1,16 +1,94 @@ -const std = @import("../std.zig"); +const std = @import("std");//("../std.zig"); const assert = std.debug.assert; +const mem = std.mem; +const unicode = std.unicode; +const ascii = std.ascii; + +pub const ParseEscapeError = std.unicode.UnicodeError || error{ + ExpectXDigit, + ExpectLCurly, + ExpectRCurly, +}; +inline fn parseEscape(escape_sequence: []const u8, ret_len: *u4) ParseEscapeError!u21 { + var ret: u21 = undefined; + var it = mem.byteIterator(escape_sequence); + errdefer ret_len.* = it.i; + got_escape: { switch (it.n()) { + 'x' => { + const hi = ascii.charToDigit(it.n(), 16) catch return error.ExpectXDigit; + const lo = ascii.charToDigit(it.n(), 16) catch return error.ExpectXDigit; + ret_len.* = 3; + return u21(((hi << 4) | lo)); + }, + 'u' => { + if (it.n() != '{') return error.ExpectLCurly; + const hi = ascii.charToDigit(it.n(), 16) catch return error.ExpectXDigit; + const lo = ascii.charToDigit(it.n(), 16) catch return error.ExpectXDigit; + ret_len.* = 4; + ret = (hi << 4) | lo; + hi = ascii.charToDigit(it.n(), 16) catch { + if (it.n() != '}') return error.ExpectRCurly; + ret_len.* = 5; + break :got_escape; + }; + lo = ascii.charToDigit(it.n(), 16) catch return error.ExpectXDigit; + ret_len.* = 6; + ret |= ((hi << 4) | lo) << 8; + hi = ascii.charToDigit(it.n(), 16) catch { + if (it.n() != '}') return error.ExpectRCurly; + ret_len.* = 7; + break :got_escape; + }; + lo = 
ascii.charToDigit(it.n(), 16) catch return error.ExpectXDigit; + ret_len.* = 8; + ret |= ((hi << 4) | lo) << 16; + if (it.n() != '}') return error.ExpectRCurly; + ret_len.* = 9; + }, + else => unreachable, + }} + unicode.isValidUnicode(ret) catch |err| return err; + return ret; +} + +pub const ParseCharLiteralError = ParseEscapeError || error{ + ExpectSQuote, +}; +pub fn parseCharLiteral(char_token: []const u8) ParseCharLiteralError!u21 { + var char: u21 = undefined; + if (char_token[1] == '\\') { + var len: u4 = undefined; + char = switch (char_token[2]) { + 'x', 'u' => try parseEscape(char_token[2..], &len), + 'n' => '\n', + 'r' => '\r', + '\\' => '\\', + '\t' => '\t', + '\'' => '\'', + '\"' => '\"', + else => unreachable, + }; + if (char_token[2 + len] != '}') return error.ExpectRCurly; + } + char = char_token[2]; + if (char_token[3] != '\'') return error.ExpectSQuote; + + return char; +} + +test "zig.parseCharLiteral" { + const expect = std.testing.expect; + expect(parseCharLiteral("\'0\'") catch unreachable == '0'); + expect(parseCharLiteral("\'\x20\'") catch unreachable == ' '); +} const State = enum { Start, Backslash, }; -pub const ParseStringLiteralError = error{ +pub const ParseStringLiteralError = ParseEscapeError || error{ OutOfMemory, - - /// When this is returned, index will be the position of the character. 
- InvalidCharacter, }; /// caller owns returned memory @@ -29,7 +107,9 @@ pub fn parseStringLiteral( try list.ensureCapacity(slice.len - 1); var state = State.Start; - for (slice) |b, index| { + var index: usize = 0; + while (index < slice.len) : (index += 1) { + var b = slice[index]; switch (state) { State.Start => switch (b) { '\\' => state = State.Backslash, @@ -41,9 +121,15 @@ pub fn parseStringLiteral( else => try list.append(b), }, State.Backslash => switch (b) { - 'x' => @panic("TODO"), - 'u' => @panic("TODO"), - 'U' => @panic("TODO"), + 'x', 'u' => { + var encoded: [4]u8 = undefined; + var len: u3 = undefined; + bad_index.* = index; + len = unicode.utf8Encode(try parseEscape(char_token[2..], &len), encoded[0..]) catch unreachable; + try list.appendSlice(encoded[0..len]); + index += len; + state = State.Start; + }, 'n' => { try list.append('\n'); state = State.Start; @@ -64,6 +150,10 @@ pub fn parseStringLiteral( try list.append('"'); state = State.Start; }, + '\'' => { + try list.append('\''); + state = State.Start; + }, else => { bad_index.* = index; return error.InvalidCharacter; diff --git a/std/zig/parser_test.zig b/std/zig/parser_test.zig index c72d62f99a3b..8cca64e5f748 100644 --- a/std/zig/parser_test.zig +++ b/std/zig/parser_test.zig @@ -12,9 +12,21 @@ test "zig fmt: enum literal" { ); } -test "zig fmt: character literal larger than u8" { +test "zig fmt: character literals" { try testCanonical( - \\const x = '\U01f4a9'; + \\const x = '\x80'; + \\ + ); + try testCanonical( + \\const x = '\u{80}'; + \\ + ); + try testCanonical( + \\const x = '\u{01f4}'; + \\ + ); + try testCanonical( + \\const x = '\u{01f4a9}'; \\ ); } diff --git a/std/zig/tokenizer.zig b/std/zig/tokenizer.zig index 2ace430a15fd..0a5f489fa654 100644 --- a/std/zig/tokenizer.zig +++ b/std/zig/tokenizer.zig @@ -1,5 +1,6 @@ const std = @import("../std.zig"); const mem = std.mem; +const unicode = std.unicode; pub const Token = struct { id: Id, @@ -234,12 +235,8 @@ pub const Tokenizer = 
struct { Builtin, C, StringLiteral, - StringLiteralBackslash, MultilineStringLiteralLine, CharLiteral, - CharLiteralBackslash, - CharLiteralHexEscape, - CharLiteralEnd, Backslash, Equal, Bang, @@ -619,90 +616,28 @@ pub const Tokenizer = struct { else => break, }, State.StringLiteral => switch (c) { - '\\' => { - state = State.StringLiteralBackslash; - }, '"' => { self.index += 1; break; }, - '\n' => break, // Look for this error later. - else => self.checkLiteralCharacter(), - }, - - State.StringLiteralBackslash => switch (c) { - '\n' => break, // Look for this error later. - else => { - state = State.StringLiteral; - }, - }, - - State.CharLiteral => switch (c) { - '\\' => { - state = State.CharLiteralBackslash; - }, - '\'' => { - result.id = Token.Id.Invalid; - break; - }, - else => { - if (c < 0x20 or c == 0x7f) { - result.id = Token.Id.Invalid; - break; - } - - state = State.CharLiteralEnd; - }, - }, - - State.CharLiteralBackslash => switch (c) { '\n' => { result.id = Token.Id.Invalid; break; }, - 'x' => { - state = State.CharLiteralHexEscape; - seen_escape_digits = 0; - expected_escape_digits = 2; - }, - 'u' => { - state = State.CharLiteralHexEscape; - seen_escape_digits = 0; - expected_escape_digits = 4; - }, - 'U' => { - state = State.CharLiteralHexEscape; - seen_escape_digits = 0; - expected_escape_digits = 6; - }, - else => { - state = State.CharLiteralEnd; - }, - }, - - State.CharLiteralHexEscape => switch (c) { - '0'...'9', 'a'...'z', 'A'...'F' => { - seen_escape_digits += 1; - if (seen_escape_digits == expected_escape_digits) { - state = State.CharLiteralEnd; - } - }, - else => { - result.id = Token.Id.Invalid; - break; - }, + else => self.checkLiteralCharacter(), }, - State.CharLiteralEnd => switch (c) { + State.CharLiteral => switch (c) { '\'' => { result.id = Token.Id.CharLiteral; self.index += 1; break; }, - else => { + '\n' => { result.id = Token.Id.Invalid; break; }, + else => self.checkLiteralCharacter(), }, State.MultilineStringLiteralLine => 
switch (c) { @@ -1052,10 +987,6 @@ pub const Tokenizer = struct { State.SawAtSign, State.Backslash, State.CharLiteral, - State.CharLiteralBackslash, - State.CharLiteralHexEscape, - State.CharLiteralEnd, - State.StringLiteralBackslash, State.LBracketStar, State.LBracketStarC, => { diff --git a/test/stage1/behavior/misc.zig b/test/stage1/behavior/misc.zig index fd407821e678..c6e844ee4e81 100644 --- a/test/stage1/behavior/misc.zig +++ b/test/stage1/behavior/misc.zig @@ -190,7 +190,7 @@ test "string escapes" { expect(mem.eql(u8, "\r", "\x0d")); expect(mem.eql(u8, "\t", "\x09")); expect(mem.eql(u8, "\\", "\x5c")); - expect(mem.eql(u8, "\u1234\u0069", "\xe1\x88\xb4\x69")); + expect(mem.eql(u8, "\u{1234}\u{0069}", "\xe1\x88\xb4\x69")); } test "multiline string" { @@ -701,6 +701,11 @@ test "thread local variable" { } test "unicode escape in character literal" { - var a: u24 = '\U01f4a9'; + var a: u24 = '\u{01f4a9}'; + expect(a == 128169); +} + +test "utf-8 in character literal" { + var a: u24 = '💩'; expect(a == 128169); }