Serialize the newline_list to avoid recomputing it later

eregon · eregon · commit 4eaaa9011488 · 2024-02-13T17:31:56.000+01:00
* Fixes #2380
diff --git a/docs/serialization.md b/docs/serialization.md
@@ -85,6 +85,8 @@ The header is structured like the following table:
 | `1` | 1 indicates only semantics fields were serialized, 0 indicates all fields were serialized (including location fields) |
 | string | the encoding name |
 | varsint | the start line |
+| varuint | number of newline offsets |
+| varuint* | newline offsets |
 | varuint | number of comments |
 | comment* | comments |
 | varuint | number of magic comments |
diff --git a/ext/prism/extension.c b/ext/prism/extension.c
@@ -542,9 +542,9 @@ parse_lex_input(pm_string_t *input, const pm_options_t *options, bool return_nod
     pm_parser_init(&parser, pm_string_source(input), pm_string_length(input), options);
     pm_parser_register_encoding_changed_callback(&parser, parse_lex_encoding_changed_callback);
 
-    VALUE offsets = rb_ary_new();
-    VALUE source_argv[] = { rb_str_new((const char *) pm_string_source(input), pm_string_length(input)), ULONG2NUM(parser.start_line), offsets };
-    VALUE source = rb_class_new_instance(3, source_argv, rb_cPrismSource);
+    VALUE source_string = rb_str_new((const char *) pm_string_source(input), pm_string_length(input));
+    VALUE source_argv[] = { source_string };
+    VALUE source = rb_class_new_instance(1, source_argv, rb_cPrismSource);
 
     parse_lex_data_t parse_lex_data = {
         .source = source,
@@ -561,17 +561,18 @@ parse_lex_input(pm_string_t *input, const pm_options_t *options, bool return_nod
     parser.lex_callback = &lex_callback;
     pm_node_t *node = pm_parse(&parser);
 
-    // Here we need to update the source range to have the correct newline
-    // offsets. We do it here because we've already created the object and given
-    // it over to all of the tokens.
-    for (size_t index = 0; index < parser.newline_list.size; index++) {
-        rb_ary_push(offsets, INT2FIX(parser.newline_list.offsets[index]));
-    }
+    // Here we need to update the Source object to have the correct
+    // encoding for the source string and the correct newline offsets.
+    // We do it here because we've already created the Source object and given
+    // it over to all of the tokens, and both of these are only set after pm_parse().
+    rb_encoding *encoding = rb_enc_find(parser.encoding->name);
+    rb_enc_associate(source_string, encoding);
+    pm_source_init(source, &parser);
 
     VALUE value;
     if (return_nodes) {
         value = rb_ary_new_capa(2);
-        rb_ary_push(value, pm_ast_new(&parser, node, parse_lex_data.encoding));
+        rb_ary_push(value, pm_ast_new(&parser, node, parse_lex_data.encoding, source));
         rb_ary_push(value, parse_lex_data.tokens);
     } else {
         value = parse_lex_data.tokens;
@@ -650,7 +651,7 @@ parse_input(pm_string_t *input, const pm_options_t *options) {
 
     VALUE source = pm_source_new(&parser, encoding);
     VALUE result_argv[] = {
-        pm_ast_new(&parser, node, encoding),
+        pm_ast_new(&parser, node, encoding, source),
         parser_comments(&parser, source),
         parser_magic_comments(&parser, source),
         parser_data_loc(&parser, source),
diff --git a/ext/prism/extension.h b/ext/prism/extension.h
@@ -8,8 +8,9 @@
 #include "prism.h"
 
 VALUE pm_source_new(pm_parser_t *parser, rb_encoding *encoding);
+void pm_source_init(VALUE source, pm_parser_t *parser);
 VALUE pm_token_new(pm_parser_t *parser, pm_token_t *token, rb_encoding *encoding, VALUE source);
-VALUE pm_ast_new(pm_parser_t *parser, pm_node_t *node, rb_encoding *encoding);
+VALUE pm_ast_new(pm_parser_t *parser, pm_node_t *node, rb_encoding *encoding, VALUE source);
 
 void Init_prism_api_node(void);
 void Init_prism_pack(void);
diff --git a/lib/prism/parse_result.rb b/lib/prism/parse_result.rb
@@ -12,15 +12,13 @@ class Source
     attr_accessor :start_line
 
     # The list of newline byte offsets in the source code.
-    attr_reader :offsets
+    attr_accessor :offsets
 
-    # Create a new source object with the given source code and newline byte
-    # offsets. If no newline byte offsets are given, they will be computed from
-    # the source code.
-    def initialize(source, start_line = 1, offsets = compute_offsets(source))
+    # Create a new source object with the given source code.
+    def initialize(source)
       @source = source
-      @start_line = start_line
-      @offsets = offsets
+      @start_line = 1 # set after parsing is done
+      @offsets = [] # set after parsing is done
     end
 
     # Perform a byteslice on the source code using the given byte offset and
@@ -77,14 +75,6 @@ def find_line(byte_offset)
 
       left - 1
     end
-
-    # Find all of the newlines in the source code and return their byte offsets
-    # from the start of the string an array.
-    def compute_offsets(code)
-      offsets = [0]
-      code.b.scan("\n") { offsets << $~.end(0) }
-      offsets
-    end
   end
 
   # This represents a location in the source.
diff --git a/templates/ext/prism/api_node.c.erb b/templates/ext/prism/api_node.c.erb
@@ -36,18 +36,26 @@ pm_string_new(pm_string_t *string, rb_encoding *encoding) {
     return rb_enc_str_new((const char *) pm_string_source(string), pm_string_length(string), encoding);
 }
 
-// Create a Prism::Source object from the given parser.
+// Create a Prism::Source object from the given parser, after pm_parse() was called.
 VALUE
 pm_source_new(pm_parser_t *parser, rb_encoding *encoding) {
-    VALUE source = rb_enc_str_new((const char *) parser->start, parser->end - parser->start, encoding);
-    VALUE offsets = rb_ary_new_capa(parser->newline_list.size);
+    VALUE source_string = rb_enc_str_new((const char *) parser->start, parser->end - parser->start, encoding);
+    VALUE source_argv[] = { source_string };
+    VALUE source = rb_class_new_instance(1, source_argv, rb_cPrismSource);
+
+    pm_source_init(source, parser);
+    return source;
+}
 
+void
+pm_source_init(VALUE source, pm_parser_t *parser) {
+    rb_funcall(source, rb_intern("start_line="), 1, LONG2NUM(parser->start_line));
+
+    VALUE offsets = rb_ary_new_capa(parser->newline_list.size);
     for (size_t index = 0; index < parser->newline_list.size; index++) {
-        rb_ary_push(offsets, INT2FIX(parser->newline_list.offsets[index]));
+        rb_ary_push(offsets, ULONG2NUM(parser->newline_list.offsets[index]));
     }
-
-    VALUE source_argv[] = { source, LONG2NUM(parser->start_line), offsets };
-    return rb_class_new_instance(3, source_argv, rb_cPrismSource);
+    rb_funcall(source, rb_intern("offsets="), 1, offsets);
 }
 
 typedef struct pm_node_stack_node {
@@ -77,8 +85,7 @@ pm_node_stack_pop(pm_node_stack_node_t **stack) {
 }
 
 VALUE
-pm_ast_new(pm_parser_t *parser, pm_node_t *node, rb_encoding *encoding) {
-    VALUE source = pm_source_new(parser, encoding);
+pm_ast_new(pm_parser_t *parser, pm_node_t *node, rb_encoding *encoding, VALUE source) {
     ID *constants = calloc(parser->constant_pool.size, sizeof(ID));
 
     for (uint32_t index = 0; index < parser->constant_pool.size; index++) {
diff --git a/templates/java/org/prism/Loader.java.erb b/templates/java/org/prism/Loader.java.erb
@@ -113,6 +113,7 @@ public class Loader {
         <%- end -%>
 
         source.setStartLine(loadVarSInt());
+        source.setLineOffsets(loadLineOffsets());
 
         ParseResult.MagicComment[] magicComments = loadMagicComments();
         Nodes.Location dataLocation = loadOptionalLocation();
@@ -159,6 +160,15 @@ public class Loader {
         }
     }
 
+    private int[] loadLineOffsets() {
+        int count = loadVarUInt();
+        int[] lineOffsets = new int[count];
+        for (int i = 0; i < count; i++) {
+            lineOffsets[i] = loadVarUInt();
+        }
+        return lineOffsets;
+    }
+
     private ParseResult.MagicComment[] loadMagicComments() {
         int count = loadVarUInt();
         ParseResult.MagicComment[] magicComments = new ParseResult.MagicComment[count];
diff --git a/templates/java/org/prism/Nodes.java.erb b/templates/java/org/prism/Nodes.java.erb
@@ -42,38 +42,19 @@ public abstract class Nodes {
 
     public static final class Source {
         public final byte[] bytes;
-        private int startLine;
-        private final int[] lineOffsets;
+        private int startLine = 1;
+        private int[] lineOffsets = null;
 
         public Source(byte[] bytes) {
-            this(bytes, 1, computeLineOffsets(bytes));
-        }
-
-        public Source(byte[] bytes, int startLine, int[] lineOffsets) {
-            assert lineOffsets[0] == 0;
-            this.bytes = bytes;
-            this.startLine = startLine;
-            this.lineOffsets = lineOffsets;
+          this.bytes = bytes;
         }
 
         public void setStartLine(int startLine) {
             this.startLine = startLine;
         }
 
-        public static int[] computeLineOffsets(byte[] bytes) {
-            int[] lineOffsets = new int[8];
-            int lineOffsetsSize = 0;
-            lineOffsets[lineOffsetsSize++] = 0;
-
-            for (int i = 0; i < bytes.length; i++) {
-                if (bytes[i] == '\n') {
-                    if (lineOffsetsSize == lineOffsets.length) {
-                        lineOffsets = Arrays.copyOf(lineOffsets, lineOffsets.length * 2);
-                    }
-                    lineOffsets[lineOffsetsSize++] = i + 1;
-                }
-            }
-            return Arrays.copyOf(lineOffsets, lineOffsetsSize);
+        public void setLineOffsets(int[] lineOffsets) {
+            this.lineOffsets = lineOffsets;
         }
 
         // 1-based
diff --git a/templates/javascript/src/deserialize.js.erb b/templates/javascript/src/deserialize.js.erb
@@ -218,6 +218,13 @@ export function deserialize(source, array) {
   // JavaScript.
   buffer.readVarInt();
 
+  // Skip past the line offsets, as there is no Source object yet in JavaScript.
+  // const lineOffsets = Array.from({ length: buffer.readVarInt() }, () => buffer.readVarInt());
+  const lineOffsetsCount = buffer.readVarInt();
+  for (let i = 0; i < lineOffsetsCount; i ++) {
+    buffer.readVarInt();
+  }
+
   const comments = Array.from({ length: buffer.readVarInt() }, () => ({
     type: buffer.readVarInt(),
     location: buffer.readLocation()
diff --git a/templates/lib/prism/serialize.rb.erb b/templates/lib/prism/serialize.rb.erb
@@ -82,6 +82,10 @@ module Prism
         source.start_line = load_varsint
       end
 
+      def load_line_offsets
+        source.offsets = load_varuint.times.map { load_varuint }
+      end
+
       def load_comments
         load_varuint.times.map do
           case load_varuint
@@ -118,6 +122,7 @@ module Prism
         tokens = load_tokens
         encoding = load_encoding
         load_start_line
+        load_line_offsets
         comments, magic_comments, data_loc, errors, warnings = load_metadata
         tokens.each { |token,| token.value.force_encoding(encoding) }
 
@@ -129,6 +134,7 @@ module Prism
         load_header
         load_encoding
         load_start_line
+        load_line_offsets
 
         comments, magic_comments, data_loc, errors, warnings = load_metadata
 
diff --git a/templates/src/serialize.c.erb b/templates/src/serialize.c.erb
@@ -128,6 +128,17 @@ pm_serialize_node(pm_parser_t *parser, pm_node_t *node, pm_buffer_t *buffer) {
     }
 }
 
+static void
+pm_serialize_newline_list(pm_newline_list_t *list, pm_buffer_t *buffer) {
+    uint32_t size = pm_sizet_to_u32(list->size);
+    pm_buffer_append_varuint(buffer, size);
+
+    for (uint32_t i = 0; i < size; i++) {
+        uint32_t offset = pm_sizet_to_u32(list->offsets[i]);
+        pm_buffer_append_varuint(buffer, offset);
+    }
+}
+
 static void
 pm_serialize_comment(pm_parser_t *parser, pm_comment_t *comment, pm_buffer_t *buffer) {
     // serialize type
@@ -214,21 +225,27 @@ pm_serialize_encoding(const pm_encoding_t *encoding, pm_buffer_t *buffer) {
     pm_buffer_append_string(buffer, encoding->name, encoding_length);
 }
 
-#line <%= __LINE__ + 1 %> "<%= File.basename(__FILE__) %>"
-/**
- * Serialize the encoding, metadata, nodes, and constant pool.
- */
-void
-pm_serialize_content(pm_parser_t *parser, pm_node_t *node, pm_buffer_t *buffer) {
+static void
+pm_serialize_metadata(pm_parser_t *parser, pm_buffer_t *buffer) {
     pm_serialize_encoding(parser->encoding, buffer);
     pm_buffer_append_varsint(buffer, parser->start_line);
+    pm_serialize_newline_list(&parser->newline_list, buffer);
 <%- unless Prism::SERIALIZE_ONLY_SEMANTICS_FIELDS -%>
     pm_serialize_comment_list(parser, &parser->comment_list, buffer);
 <%- end -%>
     pm_serialize_magic_comment_list(parser, &parser->magic_comment_list, buffer);
     pm_serialize_data_loc(parser, buffer);
     pm_serialize_diagnostic_list(parser, &parser->error_list, buffer);
     pm_serialize_diagnostic_list(parser, &parser->warning_list, buffer);
+}
+
+#line <%= __LINE__ + 1 %> "<%= File.basename(__FILE__) %>"
+/**
+ * Serialize the metadata, nodes, and constant pool.
+ */
+void
+pm_serialize_content(pm_parser_t *parser, pm_node_t *node, pm_buffer_t *buffer) {
+    pm_serialize_metadata(parser, buffer);
 
     // Here we're going to leave space for the offset of the constant pool in
     // the buffer.
@@ -319,13 +336,7 @@ pm_serialize_lex(pm_buffer_t *buffer, const uint8_t *source, size_t size, const
     // Append 0 to mark end of tokens.
     pm_buffer_append_byte(buffer, 0);
 
-    pm_serialize_encoding(parser.encoding, buffer);
-    pm_buffer_append_varsint(buffer, parser.start_line);
-    pm_serialize_comment_list(&parser, &parser.comment_list, buffer);
-    pm_serialize_magic_comment_list(&parser, &parser.magic_comment_list, buffer);
-    pm_serialize_data_loc(&parser, buffer);
-    pm_serialize_diagnostic_list(&parser, &parser.error_list, buffer);
-    pm_serialize_diagnostic_list(&parser, &parser.warning_list, buffer);
+    pm_serialize_metadata(&parser, buffer);
 
     pm_node_destroy(&parser, node);
     pm_parser_free(&parser);