Skip to content

Commit

Permalink
breaking: big unicode overhaul
Browse files Browse the repository at this point in the history
Allow utf-8 in character literals

Char Unicode escape syntax.

Validate zig as UTF-8.

overhaul std.unicode

Fully implemented in stage2.

Closes: ziglang#2097
Closes: ziglang#2129
---

About the UTF-8 validation in stage1: This implementation is quite slow,
but the state automaton it claims to represent is correct,
and it has two features faster validators don't that would
make the code in stage1 more complicated:

* They don't provide the code point
* They don't provide the index of the error (although this could be
  hacked in, but at more cost)

I don't want to put that much optimization effort into stage1 and C
code.
  • Loading branch information
shawnl committed Apr 1, 2019
1 parent c5ee98e commit eea8e77
Show file tree
Hide file tree
Showing 10 changed files with 531 additions and 236 deletions.
16 changes: 11 additions & 5 deletions doc/langref.html.in
Expand Up @@ -522,7 +522,8 @@ test "string literals" {
assert(normal_bytes.len == 5);
assert(normal_bytes[1] == 'e');
assert('e' == '\x65');
assert('\U01f4a9' == 128169);
assert('\u{01f4a9}' == 128169);
assert('💩' == 128169);
assert(mem.eql(u8, "hello", "h\x65llo"));

// A C string literal is a null terminated pointer.
Expand Down Expand Up @@ -572,11 +573,15 @@ test "string literals" {
<td>hexadecimal 8-bit character code (2 digits)</td>
</tr>
<tr>
<td><code>\uNNNN</code></td>
<td><code>\u{NN}</code></td>
<td>hexadecimal 8-bit Unicode character code UTF-8 encoded (2 digits)</td>
</tr>
<tr>
<td><code>\u{NNNN}</code></td>
<td>hexadecimal 16-bit Unicode character code UTF-8 encoded (4 digits)</td>
</tr>
<tr>
<td><code>\UNNNNNN</code></td>
<td><code>\u{NNNNNN}</code></td>
<td>hexadecimal 24-bit Unicode character code UTF-8 encoded (6 digits)</td>
</tr>
</table>
Expand Down Expand Up @@ -9491,8 +9496,9 @@ eof &lt;- !.
hex &lt;- [0-9a-fA-F]
char_escape
&lt;- "\\x" hex hex
/ "\\u" hex hex hex hex
/ "\\U" hex hex hex hex hex hex
/ "\\u" { hex hex }
/ "\\u" { hex hex hex hex }
/ "\\u" { hex hex hex hex hex hex }
/ "\\" [nr\\t'"]
char_char
&lt;- char_escape
Expand Down
96 changes: 90 additions & 6 deletions src-self-hosted/ir.zig
Expand Up @@ -1147,7 +1147,10 @@ pub const Builder = struct {
return irb.lvalWrap(scope, inst, lval);
},
ast.Node.Id.MultilineStringLiteral => return error.Unimplemented,
ast.Node.Id.CharLiteral => return error.Unimplemented,
ast.Node.Id.CharLiteral => {
const char_lit = @fieldParentPtr(ast.Node.CharLiteral, "base", node);
return irb.lvalWrap(scope, try irb.genCharLit(char_lit, scope), lval);
},
ast.Node.Id.BoolLiteral => return error.Unimplemented,
ast.Node.Id.NullLiteral => return error.Unimplemented,
ast.Node.Id.UndefinedLiteral => return error.Unimplemented,
Expand Down Expand Up @@ -1333,8 +1336,7 @@ pub const Builder = struct {
) catch |err| switch (err) {
error.OutOfMemory => return error.OutOfMemory,
error.InvalidBase => unreachable,
error.InvalidCharForDigit => unreachable,
error.DigitTooLargeForBase => unreachable,
error.InvalidCharacter => unreachable,
};
errdefer int_val.base.deref(irb.comp);

Expand All @@ -1343,18 +1345,100 @@ pub const Builder = struct {
return inst;
}

/// Lowers a character literal token into a comptime_int constant instruction.
/// `char_lit.token` includes the surrounding single quotes, e.g. 'a', '\x41',
/// '\u{1f4a9}', or a raw (already-validated) UTF-8 encoded character.
/// Returns error.OutOfMemory on allocation failure; malformed tokens are
/// impossible here because the tokenizer already validated them.
pub fn genCharLit(irb: *Builder, char_lit: *ast.Node.CharLiteral, scope: *Scope) !*Inst {
    const char_token = irb.code.tree_scope.tree.tokenSlice(char_lit.token);

    // Initialize to 0 so the `|=` accumulation below is defined.
    var char: u21 = 0;
    got_char: {
        if (char_token[1] == '\\') {
            char = switch (char_token[2]) {
                'x' => {
                    // '\xNN' — exactly two hex digits at indices 3 and 4.
                    // Digits were validated by the tokenizer.
                    const hi = charToDigit(char_token[3], 16) catch unreachable;
                    const lo = charToDigit(char_token[4], 16) catch unreachable;
                    char = (@intCast(u21, hi) << 4) | lo;
                    break :got_char;
                },
                'u' => {
                    // '\u{NN}' / '\u{NNNN}' / '\u{NNNNNN}'; char_token[3] == '{'.
                    // The tokenizer guarantees an even digit count of 2, 4 or 6,
                    // so the closing brace sits at index 6, 8, or 10.
                    var hex_escape_bytes: u8 = undefined;
                    if (char_token[6] == '}') {
                        hex_escape_bytes = 1;
                    } else if (char_token[8] == '}') {
                        hex_escape_bytes = 2;
                    } else if (char_token[10] == '}') {
                        hex_escape_bytes = 3;
                    } else {
                        unreachable;
                    }
                    // Assemble big-endian: the first hex byte is the most
                    // significant one.
                    var off: usize = 4;
                    while (hex_escape_bytes > 0) : (hex_escape_bytes -= 1) {
                        const hi = charToDigit(char_token[off], 16) catch unreachable;
                        const lo = charToDigit(char_token[off + 1], 16) catch unreachable;
                        const byte: u21 = (@intCast(u21, hi) << 4) | lo;
                        char |= byte << @intCast(u5, (hex_escape_bytes - 1) * 8);
                        off += 2;
                    }
                    break :got_char;
                },
                'n' => '\n',
                'r' => '\r',
                '\\' => '\\',
                // The token contains the two source characters '\' 't',
                // so we match on the letter 't', not on a literal tab.
                't' => '\t',
                '\'' => '\'',
                '\"' => '\"',
                else => unreachable,
            };
            break :got_char;
        }
        // Raw UTF-8 character. This could read one byte past the end of the
        // file, except this guarantees to not read past the first character,
        // and we have already validated the file as UTF-8.
        _ = utf8Decode(char_token[1..4], &char);
        break :got_char;
    }

    const comptime_int_type = Type.ComptimeInt.get(irb.comp);
    defer comptime_int_type.base.base.deref(irb.comp);

    // Pass the decoded code point (not an undefined identifier) to the value
    // constructor.
    const int_val = Value.Int.createFromCharLiteral(
        irb.comp,
        &comptime_int_type.base,
        char,
    ) catch |err| switch (err) {
        error.OutOfMemory => return error.OutOfMemory,
    };
    errdefer int_val.base.deref(irb.comp);

    const inst = try irb.build(Inst.Const, scope, Span.token(char_lit.token), Inst.Const.Params{});
    inst.val = IrVal{ .KnownValue = &int_val.base };
    return inst;
}

pub async fn genStrLit(irb: *Builder, str_lit: *ast.Node.StringLiteral, scope: *Scope) !*Inst {
const str_token = irb.code.tree_scope.tree.tokenSlice(str_lit.token);
const src_span = Span.token(str_lit.token);

var bad_index: usize = undefined;
var buf = std.zig.parseStringLiteral(irb.comp.gpa(), str_token, &bad_index) catch |err| switch (err) {
error.OutOfMemory => return error.OutOfMemory,
error.InvalidCharacter => {
.OutOfMemory => return error.OutOfMemory,
.UnicodeSurrogateHalf, .UnicodeCodepointTooLarge => {
var hex_string = if (mem.indexOfScalar(u8, str_token, '}')) |i| str_token[2..i] else str_token[2..str_token.len];
try irb.comp.addCompileError(
irb.code.tree_scope,
src_span,
"Unicode codepoint U+{} cannot be represented in UTF-16 and is invalid",
hex_string,
);
return error.SemanticAnalysisFailed;
},
.ExpectXDigit, .ExpectLCurly, .ExpectRCurly => {
try irb.comp.addCompileError(
irb.code.tree_scope,
src_span,
"invalid character in string literal: '{c}'",
"expected {}, got '{c}'",
switch (err) {
.ExpectXDigit => "hexidecimal digit",
.ExpectLCurly => "left curly bracket '{'",
.ExpectRCurly => "right curly bracket '}'",
},
str_token[bad_index],
);
return error.SemanticAnalysisFailed;
Expand Down
21 changes: 21 additions & 0 deletions src-self-hosted/value.zig
Expand Up @@ -534,6 +534,27 @@ pub const Value = struct {
return self;
}

/// Creates a ref-counted Int value holding the given character literal code
/// point. Takes a reference on `typ` for the lifetime of the returned value.
/// Caller owns the returned *Int (initial ref_count of 1) and releases it
/// with deref. Returns error.OutOfMemory on allocation failure; on error,
/// all partially-acquired resources (self, the type ref, the big int) are
/// released.
pub fn createFromCharLiteral(comp: *Compilation, typ: *Type, value: u21) !*Int {
    const self = try comp.gpa().create(Value.Int);
    errdefer comp.gpa().destroy(self);

    self.* = Value.Int{
        .base = Value{
            .id = Value.Id.Int,
            .typ = typ,
            .ref_count = std.atomic.Int(usize).init(1),
        },
        .big_int = undefined,
    };

    typ.base.ref();
    // Release the type reference if a later step fails; otherwise it is
    // dropped when the returned value's ref_count reaches zero.
    errdefer typ.base.deref(comp);

    self.big_int = try std.math.big.Int.init(comp.gpa());
    errdefer self.big_int.deinit();

    try self.big_int.set(value);

    return self;
}

pub fn getLlvmConst(self: *Int, ofile: *ObjectFile) !?*llvm.Value {
switch (self.base.typ.id) {
Type.Id.Int => {
Expand Down
102 changes: 76 additions & 26 deletions src/tokenizer.cpp
Expand Up @@ -7,6 +7,7 @@

#include "tokenizer.hpp"
#include "util.hpp"
#include "utf8-lookup.h"

#include <stdarg.h>
#include <stdlib.h>
Expand Down Expand Up @@ -219,6 +220,7 @@ enum TokenizeState {
TokenizeStateSawDotDot,
TokenizeStateSawAtSign,
TokenizeStateCharCode,
TokenizeStateCharCodeStart,
TokenizeStateError,
TokenizeStateLBracket,
TokenizeStateLBracketStar,
Expand All @@ -238,8 +240,7 @@ struct Tokenize {
uint32_t radix;
int32_t exp_add_amt;
bool is_exp_negative;
size_t char_code_index;
size_t char_code_end;
size_t xdigits_seen;
bool unicode;
uint32_t char_code;
int exponent_in_bin_or_dec;
Expand Down Expand Up @@ -407,8 +408,44 @@ void tokenize(Buf *buf, Tokenization *out) {
out->line_offsets = allocate<ZigList<size_t>>(1);

out->line_offsets->append(0);
// This also jumps forward when a char literal is read
unsigned remaining_bytes_in_cur_char = 0;
for (t.pos = 0; t.pos < buf_len(t.buf); t.pos += 1) {
uint8_t c = buf_ptr(t.buf)[t.pos];
uint32_t cp;
// Reject non-ASCII, except for valid UTF-8 inside strings and comments, and utf-8 character literals
// Note that this peeks ahead.
//
// In the zig version of the compiler we should look at these to build something streaming and fast:
// https://github.com/cyb70289/utf8/
// https://lemire.me/blog/2018/10/19/validating-utf-8-bytes-using-only-0-45-cycles-per-byte-avx-edition/
if (c & 0x80 && t.state != TokenizeStateError) {
if (remaining_bytes_in_cur_char > 0) {
remaining_bytes_in_cur_char--;
} else if (t.state == TokenizeStateLineComment ||
t.state == TokenizeStateLineString ||
t.state == TokenizeStateString ||
t.state == TokenizeStateCharLiteral) {
uint32_t state = 0;
unsigned i;
remaining_bytes_in_cur_char = utf8_skip_data[c];
if (buf_len(t.buf) < (t.pos + remaining_bytes_in_cur_char - 1))
tokenize_error(&t, "invalid UTF-8");
// this is a full validator implementing finite-state-automata
// https://bjoern.hoehrmann.de/utf-8/decoder/dfa/
cp = 0;
for (i = 0;i < remaining_bytes_in_cur_char; i++)
utf8_decode(&state, &cp, (uint8_t)buf_ptr(t.buf)[t.pos + i]);
if (state != UTF8_ACCEPT)
tokenize_error(&t, "invalid UTF-8");
remaining_bytes_in_cur_char--;
if (t.state == TokenizeStateCharLiteral) {
t.pos += remaining_bytes_in_cur_char;
remaining_bytes_in_cur_char = 0;
}
} else
invalid_char_error(&t, c);
}
switch (t.state) {
case TokenizeStateError:
break;
Expand Down Expand Up @@ -1050,24 +1087,14 @@ void tokenize(Buf *buf, Tokenization *out) {
t.state = TokenizeStateCharCode;
t.radix = 16;
t.char_code = 0;
t.char_code_index = 0;
t.char_code_end = 2;
t.xdigits_seen = 0;
t.unicode = false;
break;
case 'u':
t.state = TokenizeStateCharCode;
t.radix = 16;
t.char_code = 0;
t.char_code_index = 0;
t.char_code_end = 4;
t.unicode = true;
break;
case 'U':
t.state = TokenizeStateCharCode;
t.state = TokenizeStateCharCodeStart;
t.radix = 16;
t.char_code = 0;
t.char_code_index = 0;
t.char_code_end = 6;
t.xdigits_seen = 0;
t.unicode = true;
break;
case 'n':
Expand All @@ -1092,20 +1119,35 @@ void tokenize(Buf *buf, Tokenization *out) {
invalid_char_error(&t, c);
}
break;
case TokenizeStateCharCodeStart:
if (c != '{')
tokenize_error(&t, "expected {: '%c'", c);
t.state = TokenizeStateCharCode;
break;
case TokenizeStateCharCode:
{
uint32_t digit_value = get_digit_value(c);
if (digit_value >= t.radix) {
tokenize_error(&t, "invalid digit: '%c'", c);
}
t.char_code *= t.radix;
t.char_code += digit_value;
t.char_code_index += 1;
if (c != '}') {
uint32_t digit_value = get_digit_value(c);
if (digit_value >= t.radix) {
tokenize_error(&t, "invalid digit: '%c'", c);
}
t.char_code *= t.radix;
t.char_code += digit_value;
t.xdigits_seen += 1;

if (t.xdigits_seen > 6)
tokenize_error(&t, "expected }: '%c'", c);
} else
if (t.xdigits_seen % 2 != 0)
tokenize_error(&t, "expected hex digit: '%c'", c);

if (t.char_code_index >= t.char_code_end) {
if (c == '}' || (!t.unicode && t.xdigits_seen == 2)) {
if (t.unicode) {
if (t.char_code > 0x10ffff) {
tokenize_error(&t, "unicode value out of range: %x", t.char_code);
if (t.char_code > 0xD7FF &&
t.char_code < 0xE000) {
tokenize_error(&t, "unicode surrogate: 0x%x", t.char_code);
} else if (t.char_code > 0x10ffff) {
tokenize_error(&t, "unicode value out of range: 0x%x", t.char_code);
}
if (t.cur_tok->id == TokenIdCharLiteral) {
t.cur_tok->data.char_lit.c = t.char_code;
Expand Down Expand Up @@ -1149,8 +1191,15 @@ void tokenize(Buf *buf, Tokenization *out) {
case '\\':
t.state = TokenizeStateStringEscape;
break;
case '\n':
tokenize_error(&t, "newline not allowed in character literal");
default:
t.cur_tok->data.char_lit.c = c;
if (c < 128)
t.cur_tok->data.char_lit.c = c;
else {
// the utf8 parser/validator skipped forward for us and provided us this
t.cur_tok->data.char_lit.c = cp;
}
t.state = TokenizeStateCharLiteralEnd;
break;
}
Expand Down Expand Up @@ -1387,6 +1436,7 @@ void tokenize(Buf *buf, Tokenization *out) {
break;
case TokenizeStateStringEscape:
case TokenizeStateCharCode:
case TokenizeStateCharCodeStart:
if (t.cur_tok->id == TokenIdStringLiteral) {
tokenize_error(&t, "unterminated string");
} else if (t.cur_tok->id == TokenIdCharLiteral) {
Expand Down

0 comments on commit eea8e77

Please sign in to comment.