Skip to content

Commit 2b3d59f

Browse files
committed
Parse all magic comments
1 parent e3e4cb0 commit 2b3d59f

File tree

2 files changed

+179
-79
lines changed

2 files changed

+179
-79
lines changed

src/prism.c

Lines changed: 143 additions & 79 deletions
Original file line numberDiff line numberDiff line change
@@ -5213,66 +5213,17 @@ next_newline(const uint8_t *cursor, ptrdiff_t length) {
52135213
return memchr(cursor, '\n', (size_t) length);
52145214
}
52155215

5216-
// Find the start of the encoding comment. This is effectively an inlined
5217-
// version of strnstr with some modifications.
5218-
static inline const uint8_t *
5219-
parser_lex_encoding_comment_start(pm_parser_t *parser, const uint8_t *cursor, ptrdiff_t remaining) {
5220-
assert(remaining >= 0);
5221-
size_t length = (size_t) remaining;
5222-
5223-
size_t key_length = strlen("coding:");
5224-
if (key_length > length) return NULL;
5225-
5226-
const uint8_t *cursor_limit = cursor + length - key_length + 1;
5227-
while ((cursor = pm_memchr(cursor, 'c', (size_t) (cursor_limit - cursor), parser->encoding_changed, &parser->encoding)) != NULL) {
5228-
if (memcmp(cursor, "coding", key_length - 1) == 0) {
5229-
size_t whitespace_after_coding = pm_strspn_inline_whitespace(cursor + key_length - 1, parser->end - (cursor + key_length - 1));
5230-
size_t cur_pos = key_length + whitespace_after_coding;
5231-
5232-
if (cursor[cur_pos - 1] == ':' || cursor[cur_pos - 1] == '=') {
5233-
return cursor + cur_pos;
5234-
}
5235-
}
5236-
5237-
cursor++;
5238-
}
5239-
5240-
return NULL;
5241-
}
5242-
52435216
// Here we're going to check if this is a "magic" comment, and perform whatever
52445217
// actions are necessary for it here.
52455218
static void
5246-
parser_lex_encoding_comment(pm_parser_t *parser) {
5247-
const uint8_t *start = parser->current.start + 1;
5248-
const uint8_t *end = parser->current.end;
5249-
5250-
// These are the patterns we're going to match to find the encoding comment.
5251-
// This is definitely not complete or even really correct.
5252-
const uint8_t *encoding_start = parser_lex_encoding_comment_start(parser, start, end - start);
5253-
5254-
// If we didn't find anything that matched our patterns, then return. Note
5255-
// that this does a _very_ poor job of actually finding the encoding, and
5256-
// there is a lot of work to do here to better reflect actual magic comment
5257-
// parsing from CRuby, but this at least gets us part of the way there.
5258-
if (encoding_start == NULL) return;
5259-
5260-
// Skip any non-newline whitespace after the "coding:" or "coding=".
5261-
encoding_start += pm_strspn_inline_whitespace(encoding_start, end - encoding_start);
5262-
5263-
// Now determine the end of the encoding string. This is either the end of
5264-
// the line, the first whitespace character, or a punctuation mark.
5265-
const uint8_t *encoding_end = pm_strpbrk(parser, encoding_start, (const uint8_t *) " \t\f\r\v\n;,", end - encoding_start);
5266-
encoding_end = encoding_end == NULL ? end : encoding_end;
5267-
5268-
// Finally, we can determine the width of the encoding string.
5269-
size_t width = (size_t) (encoding_end - encoding_start);
5219+
parser_lex_magic_comment_encoding(pm_parser_t *parser, const uint8_t *start, const uint8_t *end) {
5220+
size_t width = (size_t) (end - start);
52705221

52715222
// First, we're going to call out to a user-defined callback if one was
52725223
// provided. If they return an encoding struct that we can use, then we'll
52735224
// use that here.
52745225
if (parser->encoding_decode_callback != NULL) {
5275-
pm_encoding_t *encoding = parser->encoding_decode_callback(parser, encoding_start, width);
5226+
pm_encoding_t *encoding = parser->encoding_decode_callback(parser, start, width);
52765227

52775228
if (encoding != NULL) {
52785229
parser->encoding = *encoding;
@@ -5284,7 +5235,7 @@ parser_lex_encoding_comment(pm_parser_t *parser) {
52845235
// Extensions like utf-8 can contain extra encoding details like,
52855236
// utf-8-dos, utf-8-linux, utf-8-mac. We treat all of these as utf-8, so we
52865237
// treat any encoding name starting with utf-8 as utf-8.
5287-
if ((encoding_start + 5 <= parser->end) && (pm_strncasecmp(encoding_start, (const uint8_t *) "utf-8", 5) == 0)) {
5238+
if ((start + 5 <= end) && (pm_strncasecmp(start, (const uint8_t *) "utf-8", 5) == 0)) {
52885239
// We don't need to do anything here because the default encoding is
52895240
// already UTF-8. We'll just return.
52905241
return;
@@ -5293,7 +5244,7 @@ parser_lex_encoding_comment(pm_parser_t *parser) {
52935244
// Next, we're going to loop through each of the encodings that we handle
52945245
// explicitly. If we found one that we understand, we'll use that value.
52955246
#define ENCODING(value, prebuilt) \
5296-
if (width == sizeof(value) - 1 && encoding_start + width <= parser->end && pm_strncasecmp(encoding_start, (const uint8_t *) value, width) == 0) { \
5247+
if (width == sizeof(value) - 1 && start + width <= end && pm_strncasecmp(start, (const uint8_t *) value, width) == 0) { \
52975248
parser->encoding = prebuilt; \
52985249
parser->encoding_changed |= true; \
52995250
if (parser->encoding_changed_callback != NULL) parser->encoding_changed_callback(parser); \
@@ -5342,39 +5293,156 @@ parser_lex_encoding_comment(pm_parser_t *parser) {
53425293
// didn't understand the encoding that the user was trying to use. In this
53435294
// case we'll keep using the default encoding but add an error to the
53445295
// parser to indicate an unsuccessful parse.
5345-
pm_parser_err(parser, encoding_start, encoding_end, PM_ERR_INVALID_ENCODING_MAGIC_COMMENT);
5296+
pm_parser_err(parser, start, end, PM_ERR_INVALID_ENCODING_MAGIC_COMMENT);
53465297
}
53475298

53485299
// Check if this is a magic comment that includes the frozen_string_literal
53495300
// pragma. If it does, set that field on the parser.
53505301
static void
5351-
parser_lex_frozen_string_literal_comment(pm_parser_t *parser) {
5352-
const uint8_t *cursor = parser->current.start + 1;
5302+
parser_lex_magic_comment_frozen_string_literal(pm_parser_t *parser, const uint8_t *start, const uint8_t *end) {
5303+
if (start + 4 <= end && pm_strncasecmp(start, (const uint8_t *) "true", 4) == 0) {
5304+
parser->frozen_string_literal = true;
5305+
}
5306+
}
5307+
5308+
// Returns true if the given byte is one of the magic comment key delimiters: a
// single quote, a double quote, a colon, or a semicolon. Every other byte
// yields false.
static inline bool
pm_char_is_magic_comment_key_delimiter(const uint8_t b) {
    switch (b) {
        case '\'':
        case '"':
        case ':':
        case ';':
            return true;
        default:
            return false;
    }
}
5312+
5313+
// Find an emacs magic comment marker (-*-) within the given bounds. If one is
5314+
// found, it returns a pointer to the start of the marker. Otherwise it returns
5315+
// NULL.
5316+
static inline const uint8_t *
5317+
parser_lex_magic_comment_emacs_marker(pm_parser_t *parser, const uint8_t *cursor, const uint8_t *end) {
5318+
while ((cursor + 3 <= end) && (cursor = pm_memchr(cursor, '-', (size_t) (end - cursor), parser->encoding_changed, &parser->encoding)) != NULL) {
5319+
if (cursor + 3 <= end && cursor[1] == '*' && cursor[2] == '-') {
5320+
return cursor;
5321+
}
5322+
cursor++;
5323+
}
5324+
return NULL;
5325+
}
5326+
5327+
// Parse the current token on the parser to see if it's a magic comment and
5328+
// potentially perform some action based on that. A regular expression that this
5329+
// function is effectively matching is:
5330+
//
5331+
// %r"([^\\s\'\":;]+)\\s*:\\s*(\"(?:\\\\.|[^\"])*\"|[^\"\\s;]+)[\\s;]*"
5332+
//
5333+
static inline void
5334+
parser_lex_magic_comment(pm_parser_t *parser, bool semantic_token_seen) {
5335+
const uint8_t *start = parser->current.start + 1;
53535336
const uint8_t *end = parser->current.end;
53545337

5355-
size_t key_length = strlen("frozen_string_literal");
5356-
if (key_length > (size_t) (end - cursor)) return;
5338+
const uint8_t *cursor;
5339+
bool indicator = false;
5340+
5341+
if ((cursor = parser_lex_magic_comment_emacs_marker(parser, start, end)) != NULL) {
5342+
start = cursor + 3;
53575343

5358-
const uint8_t *cursor_limit = cursor + (end - cursor) - key_length + 1;
5344+
if ((cursor = parser_lex_magic_comment_emacs_marker(parser, start, end)) != NULL) {
5345+
end = cursor;
5346+
indicator = true;
5347+
} else {
5348+
// If we have a start marker but not an end marker, then we cannot
5349+
// have a magic comment.
5350+
return;
5351+
}
5352+
}
53595353

5360-
while ((cursor = pm_memchr(cursor, 'f', (size_t) (cursor_limit - cursor), parser->encoding_changed, &parser->encoding)) != NULL) {
5361-
if (memcmp(cursor, "frozen_string_literal", key_length) == 0) {
5362-
cursor += key_length;
5363-
cursor += pm_strspn_inline_whitespace(cursor, end - cursor);
5354+
cursor = start;
5355+
while (cursor < end) {
5356+
while (cursor < end && (pm_char_is_magic_comment_key_delimiter(*cursor) || pm_char_is_whitespace(*cursor))) cursor++;
53645357

5365-
if (*cursor == ':' || *cursor == '=') {
5366-
cursor++;
5367-
cursor += pm_strspn_inline_whitespace(cursor, end - cursor);
5358+
const uint8_t *key_start = cursor;
5359+
while (cursor < end && (!pm_char_is_magic_comment_key_delimiter(*cursor) && !pm_char_is_whitespace(*cursor))) cursor++;
53685360

5369-
if (cursor + 4 <= end && pm_strncasecmp(cursor, (const uint8_t *) "true", 4) == 0) {
5370-
parser->frozen_string_literal = true;
5371-
}
5361+
const uint8_t *key_end = cursor;
5362+
while (cursor < end && pm_char_is_whitespace(*cursor)) cursor++;
5363+
if (cursor == end) return;
53725364

5373-
return;
5365+
if (*cursor == ':') {
5366+
cursor++;
5367+
} else {
5368+
if (!indicator) return;
5369+
continue;
5370+
}
5371+
5372+
while (cursor < end && pm_char_is_whitespace(*cursor)) cursor++;
5373+
if (cursor == end) return;
5374+
5375+
const uint8_t *value_start;
5376+
const uint8_t *value_end;
5377+
5378+
if (*cursor == '"') {
5379+
value_start = ++cursor;
5380+
for (; cursor < end && *cursor != '"'; cursor++) {
5381+
if (*cursor == '\\' && (cursor + 1 < end)) cursor++;
53745382
}
5383+
value_end = cursor;
5384+
} else {
5385+
value_start = cursor;
5386+
while (cursor < end && *cursor != '"' && *cursor != ';' && !pm_char_is_whitespace(*cursor)) cursor++;
5387+
value_end = cursor;
53755388
}
53765389

5377-
cursor++;
5390+
if (indicator) {
5391+
while (cursor < end && (*cursor == ';' || pm_char_is_whitespace(*cursor))) cursor++;
5392+
} else {
5393+
while (cursor < end && pm_char_is_whitespace(*cursor)) cursor++;
5394+
if (cursor != end) return;
5395+
}
5396+
5397+
// Here, we need to do some processing on the key to swap out dashes for
5398+
// underscores. We only need to do this if there _is_ a dash in the key.
5399+
pm_string_t key;
5400+
const uint8_t *dash = pm_memchr(key_start, '-', (size_t) (key_end - key_start), parser->encoding_changed, &parser->encoding);
5401+
5402+
if (dash == NULL) {
5403+
pm_string_shared_init(&key, key_start, key_end);
5404+
} else {
5405+
size_t width = (size_t) (key_end - key_start);
5406+
uint8_t *buffer = malloc(width);
5407+
if (buffer == NULL) return;
5408+
5409+
memcpy(buffer, key_start, width);
5410+
buffer[dash - key_start] = '_';
5411+
5412+
while ((dash = pm_memchr(dash + 1, '-', (size_t) (key_end - dash - 1), parser->encoding_changed, &parser->encoding)) != NULL) {
5413+
buffer[dash - key_start] = '_';
5414+
}
5415+
5416+
pm_string_owned_init(&key, buffer, width);
5417+
}
5418+
5419+
// Finally, we can start checking the key against the list of known
5420+
// magic comment keys, and potentially change state based on that.
5421+
const char *key_source = (const char *) pm_string_source(&key);
5422+
const size_t key_length = pm_string_length(&key);
5423+
5424+
// We only want to attempt to compare against encoding comments if it's
5425+
// the first line in the file (or the second in the case of a shebang).
5426+
if (parser->current.start == parser->encoding_comment_start) {
5427+
if (
5428+
(key_length == 8 && strncasecmp(key_source, "encoding", 8) == 0) ||
5429+
(key_length == 6 && strncasecmp(key_source, "coding", 6) == 0)
5430+
) {
5431+
parser_lex_magic_comment_encoding(parser, value_start, value_end);
5432+
}
5433+
}
5434+
5435+
// We only want to handle frozen string literal comments if it's before
5436+
// any semantic tokens have been seen.
5437+
if (!semantic_token_seen) {
5438+
if (key_length == 21 && strncasecmp(key_source, "frozen_string_literal", 21) == 0) {
5439+
parser_lex_magic_comment_frozen_string_literal(parser, value_start, value_end);
5440+
}
5441+
}
5442+
5443+
// When we're done, we want to free the string in case we had to
5444+
// allocate memory for it.
5445+
pm_string_free(&key);
53785446
}
53795447
}
53805448

@@ -6976,13 +7044,9 @@ parser_lex(pm_parser_t *parser) {
69767044
parser->current.type = PM_TOKEN_COMMENT;
69777045
parser_lex_callback(parser);
69787046

6979-
if (parser->current.start == parser->encoding_comment_start) {
6980-
parser_lex_encoding_comment(parser);
6981-
}
6982-
6983-
if (!semantic_token_seen) {
6984-
parser_lex_frozen_string_literal_comment(parser);
6985-
}
7047+
// Here, parse the comment to see if it's a magic comment
7048+
// and potentially change state on the parser.
7049+
parser_lex_magic_comment(parser, semantic_token_seen);
69867050

69877051
lexed_comment = true;
69887052
}

test/prism/magic_comment_test.rb

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
# frozen_string_literal: true
2+
3+
require_relative "test_helper"
4+
5+
module Prism
  # Checks that, for a range of encoding magic comments, the encoding of the
  # source parsed by Prism matches the encoding reported by Ripper.
  class MagicCommentTest < TestCase
    comments = [
      "# encoding: ascii",
      "# coding: ascii",
      "# eNcOdInG: ascii",
      "# CoDiNg: ascii",
      "# \s\t\v encoding \s\t\v : \s\t\v ascii \s\t\v",
      "# -*- encoding: ascii -*-",
      "# -*- coding: ascii -*-",
      "# -*- eNcOdInG: ascii -*-",
      "# -*- CoDiNg: ascii -*-",
      "# -*- \s\t\v encoding \s\t\v : \s\t\v ascii \s\t\v -*-",
      "# -*- foo: bar; encoding: ascii -*-",
      "# coding \t \r \v : \t \v \r ascii-8bit\n"
    ]

    # Define one test method per comment, named after the comment itself.
    comments.each do |comment|
      define_method(:"test_magic_comment_#{comment}") { assert_magic_comment(comment) }
    end

    private

    # Parse the example with Ripper and with Prism, and assert that both end
    # up with the same source encoding.
    def assert_magic_comment(example)
      ripper = Ripper.new(example)
      ripper.parse

      assert_equal ripper.encoding, Prism.parse(example).source.source.encoding
    end
  end
end

0 commit comments

Comments
 (0)