@@ -5213,66 +5213,17 @@ next_newline(const uint8_t *cursor, ptrdiff_t length) {
5213
5213
return memchr(cursor, '\n', (size_t) length);
5214
5214
}
5215
5215
5216
- // Find the start of the encoding comment. This is effectively an inlined
5217
- // version of strnstr with some modifications.
5218
- static inline const uint8_t *
5219
- parser_lex_encoding_comment_start(pm_parser_t *parser, const uint8_t *cursor, ptrdiff_t remaining) {
5220
- assert(remaining >= 0);
5221
- size_t length = (size_t) remaining;
5222
-
5223
- size_t key_length = strlen("coding:");
5224
- if (key_length > length) return NULL;
5225
-
5226
- const uint8_t *cursor_limit = cursor + length - key_length + 1;
5227
- while ((cursor = pm_memchr(cursor, 'c', (size_t) (cursor_limit - cursor), parser->encoding_changed, &parser->encoding)) != NULL) {
5228
- if (memcmp(cursor, "coding", key_length - 1) == 0) {
5229
- size_t whitespace_after_coding = pm_strspn_inline_whitespace(cursor + key_length - 1, parser->end - (cursor + key_length - 1));
5230
- size_t cur_pos = key_length + whitespace_after_coding;
5231
-
5232
- if (cursor[cur_pos - 1] == ':' || cursor[cur_pos - 1] == '=') {
5233
- return cursor + cur_pos;
5234
- }
5235
- }
5236
-
5237
- cursor++;
5238
- }
5239
-
5240
- return NULL;
5241
- }
5242
-
5243
5216
// Here we're going to check if this is a "magic" comment, and perform whatever
5244
5217
// actions are necessary for it here.
5245
5218
static void
5246
- parser_lex_encoding_comment(pm_parser_t *parser) {
5247
- const uint8_t *start = parser->current.start + 1;
5248
- const uint8_t *end = parser->current.end;
5249
-
5250
- // These are the patterns we're going to match to find the encoding comment.
5251
- // This is definitely not complete or even really correct.
5252
- const uint8_t *encoding_start = parser_lex_encoding_comment_start(parser, start, end - start);
5253
-
5254
- // If we didn't find anything that matched our patterns, then return. Note
5255
- // that this does a _very_ poor job of actually finding the encoding, and
5256
- // there is a lot of work to do here to better reflect actual magic comment
5257
- // parsing from CRuby, but this at least gets us part of the way there.
5258
- if (encoding_start == NULL) return;
5259
-
5260
- // Skip any non-newline whitespace after the "coding:" or "coding=".
5261
- encoding_start += pm_strspn_inline_whitespace(encoding_start, end - encoding_start);
5262
-
5263
- // Now determine the end of the encoding string. This is either the end of
5264
- // the line, the first whitespace character, or a punctuation mark.
5265
- const uint8_t *encoding_end = pm_strpbrk(parser, encoding_start, (const uint8_t *) " \t\f\r\v\n;,", end - encoding_start);
5266
- encoding_end = encoding_end == NULL ? end : encoding_end;
5267
-
5268
- // Finally, we can determine the width of the encoding string.
5269
- size_t width = (size_t) (encoding_end - encoding_start);
5219
+ parser_lex_magic_comment_encoding(pm_parser_t *parser, const uint8_t *start, const uint8_t *end) {
5220
+ size_t width = (size_t) (end - start);
5270
5221
5271
5222
// First, we're going to call out to a user-defined callback if one was
5272
5223
// provided. If they return an encoding struct that we can use, then we'll
5273
5224
// use that here.
5274
5225
if (parser->encoding_decode_callback != NULL) {
5275
- pm_encoding_t *encoding = parser->encoding_decode_callback(parser, encoding_start , width);
5226
+ pm_encoding_t *encoding = parser->encoding_decode_callback(parser, start , width);
5276
5227
5277
5228
if (encoding != NULL) {
5278
5229
parser->encoding = *encoding;
@@ -5284,7 +5235,7 @@ parser_lex_encoding_comment(pm_parser_t *parser) {
5284
5235
// Extensions like utf-8 can contain extra encoding details like,
5285
5236
// utf-8-dos, utf-8-linux, utf-8-mac. We treat all of these as utf-8; that
5286
5237
// is, any encoding name that starts with utf-8 is treated as utf-8.
5287
- if ((encoding_start + 5 <= parser-> end) && (pm_strncasecmp(encoding_start , (const uint8_t *) "utf-8", 5) == 0)) {
5238
+ if ((start + 5 <= end) && (pm_strncasecmp(start , (const uint8_t *) "utf-8", 5) == 0)) {
5288
5239
// We don't need to do anything here because the default encoding is
5289
5240
// already UTF-8. We'll just return.
5290
5241
return;
@@ -5293,7 +5244,7 @@ parser_lex_encoding_comment(pm_parser_t *parser) {
5293
5244
// Next, we're going to loop through each of the encodings that we handle
5294
5245
// explicitly. If we found one that we understand, we'll use that value.
5295
5246
#define ENCODING(value, prebuilt) \
5296
- if (width == sizeof(value) - 1 && encoding_start + width <= parser-> end && pm_strncasecmp(encoding_start , (const uint8_t *) value, width) == 0) { \
5247
+ if (width == sizeof(value) - 1 && start + width <= end && pm_strncasecmp(start , (const uint8_t *) value, width) == 0) { \
5297
5248
parser->encoding = prebuilt; \
5298
5249
parser->encoding_changed |= true; \
5299
5250
if (parser->encoding_changed_callback != NULL) parser->encoding_changed_callback(parser); \
@@ -5342,39 +5293,156 @@ parser_lex_encoding_comment(pm_parser_t *parser) {
5342
5293
// didn't understand the encoding that the user was trying to use. In this
5343
5294
// case we'll keep using the default encoding but add an error to the
5344
5295
// parser to indicate an unsuccessful parse.
5345
- pm_parser_err(parser, encoding_start, encoding_end , PM_ERR_INVALID_ENCODING_MAGIC_COMMENT);
5296
+ pm_parser_err(parser, start, end , PM_ERR_INVALID_ENCODING_MAGIC_COMMENT);
5346
5297
}
5347
5298
5348
5299
// Check if this is a magic comment that includes the frozen_string_literal
5349
5300
// pragma. If it does, set that field on the parser.
5350
5301
static void
5351
- parser_lex_frozen_string_literal_comment(pm_parser_t *parser) {
5352
- const uint8_t *cursor = parser->current.start + 1;
5302
+ parser_lex_magic_comment_frozen_string_literal(pm_parser_t *parser, const uint8_t *start, const uint8_t *end) {
5303
+ if (start + 4 <= end && pm_strncasecmp(start, (const uint8_t *) "true", 4) == 0) {
5304
+ parser->frozen_string_literal = true;
5305
+ }
5306
+ }
5307
+
5308
// Returns true if the given byte is one of the four magic comment key
// delimiters: a single quote, a double quote, a colon, or a semicolon.
static inline bool
pm_char_is_magic_comment_key_delimiter(const uint8_t b) {
    switch (b) {
        case '\'':
        case '"':
        case ':':
        case ';':
            return true;
        default:
            return false;
    }
}
5312
+
5313
+ // Find an emacs magic comment marker (-*-) within the given bounds. If one is
5314
+ // found, it returns a pointer to the start of the marker. Otherwise it returns
5315
+ // NULL.
5316
+ static inline const uint8_t *
5317
+ parser_lex_magic_comment_emacs_marker(pm_parser_t *parser, const uint8_t *cursor, const uint8_t *end) {
5318
+ while ((cursor + 3 <= end) && (cursor = pm_memchr(cursor, '-', (size_t) (end - cursor), parser->encoding_changed, &parser->encoding)) != NULL) {
5319
+ if (cursor + 3 <= end && cursor[1] == '*' && cursor[2] == '-') {
5320
+ return cursor;
5321
+ }
5322
+ cursor++;
5323
+ }
5324
+ return NULL;
5325
+ }
5326
+
5327
+ // Parse the current token on the parser to see if it's a magic comment and
5328
+ // potentially perform some action based on that. A regular expression that this
5329
+ // function is effectively matching is:
5330
+ //
5331
+ // %r"([^\\s\'\":;]+)\\s*:\\s*(\"(?:\\\\.|[^\"])*\"|[^\"\\s;]+)[\\s;]*"
5332
+ //
5333
+ static inline void
5334
+ parser_lex_magic_comment(pm_parser_t *parser, bool semantic_token_seen) {
5335
+ const uint8_t *start = parser->current.start + 1;
5353
5336
const uint8_t *end = parser->current.end;
5354
5337
5355
- size_t key_length = strlen("frozen_string_literal");
5356
- if (key_length > (size_t) (end - cursor)) return;
5338
+ const uint8_t *cursor;
5339
+ bool indicator = false;
5340
+
5341
+ if ((cursor = parser_lex_magic_comment_emacs_marker(parser, start, end)) != NULL) {
5342
+ start = cursor + 3;
5357
5343
5358
- const uint8_t *cursor_limit = cursor + (end - cursor) - key_length + 1;
5344
+ if ((cursor = parser_lex_magic_comment_emacs_marker(parser, start, end)) != NULL) {
5345
+ end = cursor;
5346
+ indicator = true;
5347
+ } else {
5348
+ // If we have a start marker but not an end marker, then we cannot
5349
+ // have a magic comment.
5350
+ return;
5351
+ }
5352
+ }
5359
5353
5360
- while ((cursor = pm_memchr(cursor, 'f', (size_t) (cursor_limit - cursor), parser->encoding_changed, &parser->encoding)) != NULL) {
5361
- if (memcmp(cursor, "frozen_string_literal", key_length) == 0) {
5362
- cursor += key_length;
5363
- cursor += pm_strspn_inline_whitespace(cursor, end - cursor);
5354
+ cursor = start;
5355
+ while (cursor < end) {
5356
+ while (cursor < end && (pm_char_is_magic_comment_key_delimiter(*cursor) || pm_char_is_whitespace(*cursor))) cursor++;
5364
5357
5365
- if (*cursor == ':' || *cursor == '=') {
5366
- cursor++;
5367
- cursor += pm_strspn_inline_whitespace(cursor, end - cursor);
5358
+ const uint8_t *key_start = cursor;
5359
+ while (cursor < end && (!pm_char_is_magic_comment_key_delimiter(*cursor) && !pm_char_is_whitespace(*cursor))) cursor++;
5368
5360
5369
- if (cursor + 4 <= end && pm_strncasecmp(cursor, ( const uint8_t *) "true", 4) == 0) {
5370
- parser->frozen_string_literal = true ;
5371
- }
5361
+ const uint8_t *key_end = cursor;
5362
+ while (cursor < end && pm_char_is_whitespace(*cursor)) cursor++ ;
5363
+ if (cursor == end) return;
5372
5364
5373
- return;
5365
+ if (*cursor == ':') {
5366
+ cursor++;
5367
+ } else {
5368
+ if (!indicator) return;
5369
+ continue;
5370
+ }
5371
+
5372
+ while (cursor < end && pm_char_is_whitespace(*cursor)) cursor++;
5373
+ if (cursor == end) return;
5374
+
5375
+ const uint8_t *value_start;
5376
+ const uint8_t *value_end;
5377
+
5378
+ if (*cursor == '"') {
5379
+ value_start = ++cursor;
5380
+ for (; cursor < end && *cursor != '"'; cursor++) {
5381
+ if (*cursor == '\\' && (cursor + 1 < end)) cursor++;
5374
5382
}
5383
+ value_end = cursor;
5384
+ } else {
5385
+ value_start = cursor;
5386
+ while (cursor < end && *cursor != '"' && *cursor != ';' && !pm_char_is_whitespace(*cursor)) cursor++;
5387
+ value_end = cursor;
5375
5388
}
5376
5389
5377
- cursor++;
5390
+ if (indicator) {
5391
+ while (cursor < end && (*cursor == ';' || pm_char_is_whitespace(*cursor))) cursor++;
5392
+ } else {
5393
+ while (cursor < end && pm_char_is_whitespace(*cursor)) cursor++;
5394
+ if (cursor != end) return;
5395
+ }
5396
+
5397
+ // Here, we need to do some processing on the key to swap out dashes for
5398
+ // underscores. We only need to do this if there _is_ a dash in the key.
5399
+ pm_string_t key;
5400
+ const uint8_t *dash = pm_memchr(key_start, '-', (size_t) (key_end - key_start), parser->encoding_changed, &parser->encoding);
5401
+
5402
+ if (dash == NULL) {
5403
+ pm_string_shared_init(&key, key_start, key_end);
5404
+ } else {
5405
+ size_t width = (size_t) (key_end - key_start);
5406
+ uint8_t *buffer = malloc(width);
5407
+ if (buffer == NULL) return;
5408
+
5409
+ memcpy(buffer, key_start, width);
5410
+ buffer[dash - key_start] = '_';
5411
+
5412
+ while ((dash = pm_memchr(dash + 1, '-', (size_t) (key_end - dash - 1), parser->encoding_changed, &parser->encoding)) != NULL) {
5413
+ buffer[dash - key_start] = '_';
5414
+ }
5415
+
5416
+ pm_string_owned_init(&key, buffer, width);
5417
+ }
5418
+
5419
+ // Finally, we can start checking the key against the list of known
5420
+ // magic comment keys, and potentially change state based on that.
5421
+ const char *key_source = (const char *) pm_string_source(&key);
5422
+ const size_t key_length = pm_string_length(&key);
5423
+
5424
+ // We only want to attempt to compare against encoding comments if it's
5425
+ // the first line in the file (or the second in the case of a shebang).
5426
+ if (parser->current.start == parser->encoding_comment_start) {
5427
+ if (
5428
+ (key_length == 8 && strncasecmp(key_source, "encoding", 8) == 0) ||
5429
+ (key_length == 6 && strncasecmp(key_source, "coding", 6) == 0)
5430
+ ) {
5431
+ parser_lex_magic_comment_encoding(parser, value_start, value_end);
5432
+ }
5433
+ }
5434
+
5435
+ // We only want to handle frozen string literal comments if it's before
5436
+ // any semantic tokens have been seen.
5437
+ if (!semantic_token_seen) {
5438
+ if (key_length == 21 && strncasecmp(key_source, "frozen_string_literal", 21) == 0) {
5439
+ parser_lex_magic_comment_frozen_string_literal(parser, value_start, value_end);
5440
+ }
5441
+ }
5442
+
5443
+ // When we're done, we want to free the string in case we had to
5444
+ // allocate memory for it.
5445
+ pm_string_free(&key);
5378
5446
}
5379
5447
}
5380
5448
@@ -6976,13 +7044,9 @@ parser_lex(pm_parser_t *parser) {
6976
7044
parser->current.type = PM_TOKEN_COMMENT;
6977
7045
parser_lex_callback(parser);
6978
7046
6979
- if (parser->current.start == parser->encoding_comment_start) {
6980
- parser_lex_encoding_comment(parser);
6981
- }
6982
-
6983
- if (!semantic_token_seen) {
6984
- parser_lex_frozen_string_literal_comment(parser);
6985
- }
7047
+ // Here, parse the comment to see if it's a magic comment
7048
+ // and potentially change state on the parser.
7049
+ parser_lex_magic_comment(parser, semantic_token_seen);
6986
7050
6987
7051
lexed_comment = true;
6988
7052
}
0 commit comments