Skip to content

Commit 4eaaa90

Browse files
committed
Serialize the newline_list to avoid recomputing it again later
* Fixes #2380
1 parent 18ada31 commit 4eaaa90

File tree

10 files changed

+89
-73
lines changed

10 files changed

+89
-73
lines changed

docs/serialization.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,8 @@ The header is structured like the following table:
8585
| `1` | 1 indicates only semantics fields were serialized, 0 indicates all fields were serialized (including location fields) |
8686
| string | the encoding name |
8787
| varsint | the start line |
88+
| varuint | number of newline offsets |
89+
| varuint* | newline offsets |
8890
| varuint | number of comments |
8991
| comment* | comments |
9092
| varuint | number of magic comments |

ext/prism/extension.c

Lines changed: 12 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -542,9 +542,9 @@ parse_lex_input(pm_string_t *input, const pm_options_t *options, bool return_nod
542542
pm_parser_init(&parser, pm_string_source(input), pm_string_length(input), options);
543543
pm_parser_register_encoding_changed_callback(&parser, parse_lex_encoding_changed_callback);
544544

545-
VALUE offsets = rb_ary_new();
546-
VALUE source_argv[] = { rb_str_new((const char *) pm_string_source(input), pm_string_length(input)), ULONG2NUM(parser.start_line), offsets };
547-
VALUE source = rb_class_new_instance(3, source_argv, rb_cPrismSource);
545+
VALUE source_string = rb_str_new((const char *) pm_string_source(input), pm_string_length(input));
546+
VALUE source_argv[] = { source_string };
547+
VALUE source = rb_class_new_instance(1, source_argv, rb_cPrismSource);
548548

549549
parse_lex_data_t parse_lex_data = {
550550
.source = source,
@@ -561,17 +561,18 @@ parse_lex_input(pm_string_t *input, const pm_options_t *options, bool return_nod
561561
parser.lex_callback = &lex_callback;
562562
pm_node_t *node = pm_parse(&parser);
563563

564-
// Here we need to update the source range to have the correct newline
565-
// offsets. We do it here because we've already created the object and given
566-
// it over to all of the tokens.
567-
for (size_t index = 0; index < parser.newline_list.size; index++) {
568-
rb_ary_push(offsets, INT2FIX(parser.newline_list.offsets[index]));
569-
}
564+
// Here we need to update the Source object to have the correct
565+
// encoding for the source string and the correct newline offsets.
566+
// We do it here because we've already created the Source object and given
567+
// it over to all of the tokens, and both of these are only set after pm_parse().
568+
rb_encoding *encoding = rb_enc_find(parser.encoding->name);
569+
rb_enc_associate(source_string, encoding);
570+
pm_source_init(source, &parser);
570571

571572
VALUE value;
572573
if (return_nodes) {
573574
value = rb_ary_new_capa(2);
574-
rb_ary_push(value, pm_ast_new(&parser, node, parse_lex_data.encoding));
575+
rb_ary_push(value, pm_ast_new(&parser, node, parse_lex_data.encoding, source));
575576
rb_ary_push(value, parse_lex_data.tokens);
576577
} else {
577578
value = parse_lex_data.tokens;
@@ -650,7 +651,7 @@ parse_input(pm_string_t *input, const pm_options_t *options) {
650651

651652
VALUE source = pm_source_new(&parser, encoding);
652653
VALUE result_argv[] = {
653-
pm_ast_new(&parser, node, encoding),
654+
pm_ast_new(&parser, node, encoding, source),
654655
parser_comments(&parser, source),
655656
parser_magic_comments(&parser, source),
656657
parser_data_loc(&parser, source),

ext/prism/extension.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,9 @@
88
#include "prism.h"
99

1010
VALUE pm_source_new(pm_parser_t *parser, rb_encoding *encoding);
11+
void pm_source_init(VALUE source, pm_parser_t *parser);
1112
VALUE pm_token_new(pm_parser_t *parser, pm_token_t *token, rb_encoding *encoding, VALUE source);
12-
VALUE pm_ast_new(pm_parser_t *parser, pm_node_t *node, rb_encoding *encoding);
13+
VALUE pm_ast_new(pm_parser_t *parser, pm_node_t *node, rb_encoding *encoding, VALUE source);
1314

1415
void Init_prism_api_node(void);
1516
void Init_prism_pack(void);

lib/prism/parse_result.rb

Lines changed: 5 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -12,15 +12,13 @@ class Source
1212
attr_accessor :start_line
1313

1414
# The list of newline byte offsets in the source code.
15-
attr_reader :offsets
15+
attr_accessor :offsets
1616

17-
# Create a new source object with the given source code and newline byte
18-
# offsets. If no newline byte offsets are given, they will be computed from
19-
# the source code.
20-
def initialize(source, start_line = 1, offsets = compute_offsets(source))
17+
# Create a new source object with the given source code.
18+
def initialize(source)
2119
@source = source
22-
@start_line = start_line
23-
@offsets = offsets
20+
@start_line = 1 # set after parsing is done
21+
@offsets = [] # set after parsing is done
2422
end
2523

2624
# Perform a byteslice on the source code using the given byte offset and
@@ -77,14 +75,6 @@ def find_line(byte_offset)
7775

7876
left - 1
7977
end
80-
81-
# Find all of the newlines in the source code and return their byte offsets
82-
# from the start of the string as an array.
83-
def compute_offsets(code)
84-
offsets = [0]
85-
code.b.scan("\n") { offsets << $~.end(0) }
86-
offsets
87-
end
8878
end
8979

9080
# This represents a location in the source.

templates/ext/prism/api_node.c.erb

Lines changed: 16 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -36,18 +36,26 @@ pm_string_new(pm_string_t *string, rb_encoding *encoding) {
3636
return rb_enc_str_new((const char *) pm_string_source(string), pm_string_length(string), encoding);
3737
}
3838

39-
// Create a Prism::Source object from the given parser.
39+
// Create a Prism::Source object from the given parser, after pm_parse() has been called.
4040
VALUE
4141
pm_source_new(pm_parser_t *parser, rb_encoding *encoding) {
42-
VALUE source = rb_enc_str_new((const char *) parser->start, parser->end - parser->start, encoding);
43-
VALUE offsets = rb_ary_new_capa(parser->newline_list.size);
42+
VALUE source_string = rb_enc_str_new((const char *) parser->start, parser->end - parser->start, encoding);
43+
VALUE source_argv[] = { source_string };
44+
VALUE source = rb_class_new_instance(1, source_argv, rb_cPrismSource);
45+
46+
pm_source_init(source, parser);
47+
return source;
48+
}
4449

50+
void
51+
pm_source_init(VALUE source, pm_parser_t *parser) {
52+
rb_funcall(source, rb_intern("start_line="), 1, LONG2NUM(parser->start_line));
53+
54+
VALUE offsets = rb_ary_new_capa(parser->newline_list.size);
4555
for (size_t index = 0; index < parser->newline_list.size; index++) {
46-
rb_ary_push(offsets, INT2FIX(parser->newline_list.offsets[index]));
56+
rb_ary_push(offsets, ULONG2NUM(parser->newline_list.offsets[index]));
4757
}
48-
49-
VALUE source_argv[] = { source, LONG2NUM(parser->start_line), offsets };
50-
return rb_class_new_instance(3, source_argv, rb_cPrismSource);
58+
rb_funcall(source, rb_intern("offsets="), 1, offsets);
5159
}
5260

5361
typedef struct pm_node_stack_node {
@@ -77,8 +85,7 @@ pm_node_stack_pop(pm_node_stack_node_t **stack) {
7785
}
7886

7987
VALUE
80-
pm_ast_new(pm_parser_t *parser, pm_node_t *node, rb_encoding *encoding) {
81-
VALUE source = pm_source_new(parser, encoding);
88+
pm_ast_new(pm_parser_t *parser, pm_node_t *node, rb_encoding *encoding, VALUE source) {
8289
ID *constants = calloc(parser->constant_pool.size, sizeof(ID));
8390

8491
for (uint32_t index = 0; index < parser->constant_pool.size; index++) {

templates/java/org/prism/Loader.java.erb

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -113,6 +113,7 @@ public class Loader {
113113
<%- end -%>
114114

115115
source.setStartLine(loadVarSInt());
116+
source.setLineOffsets(loadLineOffsets());
116117

117118
ParseResult.MagicComment[] magicComments = loadMagicComments();
118119
Nodes.Location dataLocation = loadOptionalLocation();
@@ -159,6 +160,15 @@ public class Loader {
159160
}
160161
}
161162

163+
private int[] loadLineOffsets() {
164+
int count = loadVarUInt();
165+
int[] lineOffsets = new int[count];
166+
for (int i = 0; i < count; i++) {
167+
lineOffsets[i] = loadVarUInt();
168+
}
169+
return lineOffsets;
170+
}
171+
162172
private ParseResult.MagicComment[] loadMagicComments() {
163173
int count = loadVarUInt();
164174
ParseResult.MagicComment[] magicComments = new ParseResult.MagicComment[count];

templates/java/org/prism/Nodes.java.erb

Lines changed: 5 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -42,38 +42,19 @@ public abstract class Nodes {
4242

4343
public static final class Source {
4444
public final byte[] bytes;
45-
private int startLine;
46-
private final int[] lineOffsets;
45+
private int startLine = 1;
46+
private int[] lineOffsets = null;
4747

4848
public Source(byte[] bytes) {
49-
this(bytes, 1, computeLineOffsets(bytes));
50-
}
51-
52-
public Source(byte[] bytes, int startLine, int[] lineOffsets) {
53-
assert lineOffsets[0] == 0;
54-
this.bytes = bytes;
55-
this.startLine = startLine;
56-
this.lineOffsets = lineOffsets;
49+
this.bytes = bytes;
5750
}
5851

5952
public void setStartLine(int startLine) {
6053
this.startLine = startLine;
6154
}
6255

63-
public static int[] computeLineOffsets(byte[] bytes) {
64-
int[] lineOffsets = new int[8];
65-
int lineOffsetsSize = 0;
66-
lineOffsets[lineOffsetsSize++] = 0;
67-
68-
for (int i = 0; i < bytes.length; i++) {
69-
if (bytes[i] == '\n') {
70-
if (lineOffsetsSize == lineOffsets.length) {
71-
lineOffsets = Arrays.copyOf(lineOffsets, lineOffsets.length * 2);
72-
}
73-
lineOffsets[lineOffsetsSize++] = i + 1;
74-
}
75-
}
76-
return Arrays.copyOf(lineOffsets, lineOffsetsSize);
56+
public void setLineOffsets(int[] lineOffsets) {
57+
this.lineOffsets = lineOffsets;
7758
}
7859

7960
// 1-based

templates/javascript/src/deserialize.js.erb

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -218,6 +218,13 @@ export function deserialize(source, array) {
218218
// JavaScript.
219219
buffer.readVarInt();
220220

221+
// Skip past the line offsets, as there is no Source object yet in JavaScript.
222+
// const lineOffsets = Array.from({ length: buffer.readVarInt() }, () => buffer.readVarInt());
223+
const lineOffsetsCount = buffer.readVarInt();
224+
for (let i = 0; i < lineOffsetsCount; i ++) {
225+
buffer.readVarInt();
226+
}
227+
221228
const comments = Array.from({ length: buffer.readVarInt() }, () => ({
222229
type: buffer.readVarInt(),
223230
location: buffer.readLocation()

templates/lib/prism/serialize.rb.erb

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,10 @@ module Prism
8282
source.start_line = load_varsint
8383
end
8484

85+
def load_line_offsets
86+
source.offsets = load_varuint.times.map { load_varuint }
87+
end
88+
8589
def load_comments
8690
load_varuint.times.map do
8791
case load_varuint
@@ -118,6 +122,7 @@ module Prism
118122
tokens = load_tokens
119123
encoding = load_encoding
120124
load_start_line
125+
load_line_offsets
121126
comments, magic_comments, data_loc, errors, warnings = load_metadata
122127
tokens.each { |token,| token.value.force_encoding(encoding) }
123128

@@ -129,6 +134,7 @@ module Prism
129134
load_header
130135
load_encoding
131136
load_start_line
137+
load_line_offsets
132138

133139
comments, magic_comments, data_loc, errors, warnings = load_metadata
134140

templates/src/serialize.c.erb

Lines changed: 24 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -128,6 +128,17 @@ pm_serialize_node(pm_parser_t *parser, pm_node_t *node, pm_buffer_t *buffer) {
128128
}
129129
}
130130

131+
static void
132+
pm_serialize_newline_list(pm_newline_list_t *list, pm_buffer_t *buffer) {
133+
uint32_t size = pm_sizet_to_u32(list->size);
134+
pm_buffer_append_varuint(buffer, size);
135+
136+
for (uint32_t i = 0; i < size; i++) {
137+
uint32_t offset = pm_sizet_to_u32(list->offsets[i]);
138+
pm_buffer_append_varuint(buffer, offset);
139+
}
140+
}
141+
131142
static void
132143
pm_serialize_comment(pm_parser_t *parser, pm_comment_t *comment, pm_buffer_t *buffer) {
133144
// serialize type
@@ -214,21 +225,27 @@ pm_serialize_encoding(const pm_encoding_t *encoding, pm_buffer_t *buffer) {
214225
pm_buffer_append_string(buffer, encoding->name, encoding_length);
215226
}
216227

217-
#line <%= __LINE__ + 1 %> "<%= File.basename(__FILE__) %>"
218-
/**
219-
* Serialize the encoding, metadata, nodes, and constant pool.
220-
*/
221-
void
222-
pm_serialize_content(pm_parser_t *parser, pm_node_t *node, pm_buffer_t *buffer) {
228+
static void
229+
pm_serialize_metadata(pm_parser_t *parser, pm_buffer_t *buffer) {
223230
pm_serialize_encoding(parser->encoding, buffer);
224231
pm_buffer_append_varsint(buffer, parser->start_line);
232+
pm_serialize_newline_list(&parser->newline_list, buffer);
225233
<%- unless Prism::SERIALIZE_ONLY_SEMANTICS_FIELDS -%>
226234
pm_serialize_comment_list(parser, &parser->comment_list, buffer);
227235
<%- end -%>
228236
pm_serialize_magic_comment_list(parser, &parser->magic_comment_list, buffer);
229237
pm_serialize_data_loc(parser, buffer);
230238
pm_serialize_diagnostic_list(parser, &parser->error_list, buffer);
231239
pm_serialize_diagnostic_list(parser, &parser->warning_list, buffer);
240+
}
241+
242+
#line <%= __LINE__ + 1 %> "<%= File.basename(__FILE__) %>"
243+
/**
244+
* Serialize the metadata, nodes, and constant pool.
245+
*/
246+
void
247+
pm_serialize_content(pm_parser_t *parser, pm_node_t *node, pm_buffer_t *buffer) {
248+
pm_serialize_metadata(parser, buffer);
232249

233250
// Here we're going to leave space for the offset of the constant pool in
234251
// the buffer.
@@ -319,13 +336,7 @@ pm_serialize_lex(pm_buffer_t *buffer, const uint8_t *source, size_t size, const
319336
// Append 0 to mark end of tokens.
320337
pm_buffer_append_byte(buffer, 0);
321338

322-
pm_serialize_encoding(parser.encoding, buffer);
323-
pm_buffer_append_varsint(buffer, parser.start_line);
324-
pm_serialize_comment_list(&parser, &parser.comment_list, buffer);
325-
pm_serialize_magic_comment_list(&parser, &parser.magic_comment_list, buffer);
326-
pm_serialize_data_loc(&parser, buffer);
327-
pm_serialize_diagnostic_list(&parser, &parser.error_list, buffer);
328-
pm_serialize_diagnostic_list(&parser, &parser.warning_list, buffer);
339+
pm_serialize_metadata(&parser, buffer);
329340

330341
pm_node_destroy(&parser, node);
331342
pm_parser_free(&parser);

0 commit comments

Comments
 (0)