From 9a19cfd4cd1a16528cc997e3a510c3046b83cdec Mon Sep 17 00:00:00 2001 From: HASUMI Hitoshi Date: Fri, 16 Feb 2024 17:45:22 +0900 Subject: [PATCH] [Universal Parser] Reduce dependence on RArray in parse.y - Introduce `rb_parser_ary_t` structure to partly eliminate RArray from parse.y - In this patch, `parser_params->tokens` and `parser_params->ast->node_buffer->tokens` are now `rb_parser_ary_t *` - Instead, `ast_node_all_tokens()` internally creates a Ruby Array object from the `rb_parser_ary_t` - Also, delete `rb_ast_tokens()` and `rb_ast_set_tokens()` in node.c - Implement `rb_parser_str_escape()` - This is a port of the `rb_str_escape()` function in string.c - `rb_parser_str_escape()` does not depend on `VALUE` (RString) - Instead, it uses `rb_parser_string_t *` - This function runs when the `--dump=y` option is passed - Because the universal parser is still a work in progress, similar functions like `rb_parser_tokens_free()` exist in both node.c and parse.y. They may need to be refactored in some way in the future - Although we considered redesigning the structure from `ast->node_buffer->tokens` into `ast->tokens`, we leave it as it is because `rb_ast_t` is an imemo.
(We will address it in the future) --- ast.c | 27 ++- node.c | 37 ++-- node.h | 4 +- parse.y | 528 ++++++++++++++++++++++++++++----------------- ruby_parser.c | 25 +++ rubyparser.h | 20 ++ universal_parser.c | 3 + 7 files changed, 429 insertions(+), 215 deletions(-) diff --git a/ast.c b/ast.c index 66e237b1f22e67..f716bb995c6246 100644 --- a/ast.c +++ b/ast.c @@ -774,10 +774,35 @@ ast_node_last_column(rb_execution_context_t *ec, VALUE self) static VALUE ast_node_all_tokens(rb_execution_context_t *ec, VALUE self) { + long i; struct ASTNodeData *data; + rb_parser_ary_t *parser_tokens; + rb_parser_ast_token_t *parser_token; + VALUE str, loc, token, all_tokens; + TypedData_Get_Struct(self, struct ASTNodeData, &rb_node_type, data); - return rb_ast_tokens(data->ast); + parser_tokens = data->ast->node_buffer->tokens; + if (parser_tokens == NULL) { + return Qnil; + } + + all_tokens = rb_ary_new2(parser_tokens->len); + for (i = 0; i < parser_tokens->len; i++) { + parser_token = parser_tokens->data[i]; + str = rb_str_new(parser_token->str->ptr, parser_token->str->len); + loc = rb_ary_new_from_args(4, + INT2FIX(parser_token->loc.beg_pos.lineno), + INT2FIX(parser_token->loc.beg_pos.column), + INT2FIX(parser_token->loc.end_pos.lineno), + INT2FIX(parser_token->loc.end_pos.column) + ); + token = rb_ary_new_from_args(4, INT2FIX(parser_token->id), ID2SYM(rb_intern(parser_token->type_name)), str, loc); + rb_ary_push(all_tokens, token); + } + rb_obj_freeze(all_tokens); + + return all_tokens; } static VALUE diff --git a/node.c b/node.c index 8011d61255b93d..5e27915e1d7016 100644 --- a/node.c +++ b/node.c @@ -69,7 +69,7 @@ rb_node_buffer_new(void) init_node_buffer_list(&nb->unmarkable, (node_buffer_elem_t*)&nb[1], ruby_xmalloc); init_node_buffer_list(&nb->markable, (node_buffer_elem_t*)((size_t)nb->unmarkable.head + bucket_size), ruby_xmalloc); nb->local_tables = 0; - nb->tokens = Qnil; + nb->tokens = 0; #ifdef UNIVERSAL_PARSER nb->config = config; #endif @@ -176,6 +176,24 @@ 
parser_string_free(rb_ast_t *ast, rb_parser_string_t *str) xfree(str); } +static void +parser_ast_token_free(rb_ast_t *ast, rb_parser_ast_token_t *token) +{ + if (!token) return; + parser_string_free(ast, token->str); + xfree(token); +} + +static void +parser_tokens_free(rb_ast_t *ast, rb_parser_ary_t *tokens) +{ + for (long i = 0; i < tokens->len; i++) { + parser_ast_token_free(ast, tokens->data[i]); + } + xfree(tokens->data); + xfree(tokens); +} + static void free_ast_value(rb_ast_t *ast, void *ctx, NODE *node) { @@ -228,6 +246,9 @@ free_ast_value(rb_ast_t *ast, void *ctx, NODE *node) static void rb_node_buffer_free(rb_ast_t *ast, node_buffer_t *nb) { + if (ast->node_buffer && ast->node_buffer->tokens) { + parser_tokens_free(ast, ast->node_buffer->tokens); + } iterate_node_values(ast, &nb->unmarkable, free_ast_value, NULL); node_buffer_list_free(ast, &nb->unmarkable); node_buffer_list_free(ast, &nb->markable); @@ -388,8 +409,6 @@ void rb_ast_mark_and_move(rb_ast_t *ast, bool reference_updating) { if (ast->node_buffer) { - rb_gc_mark_and_move(&ast->node_buffer->tokens); - node_buffer_t *nb = ast->node_buffer; iterate_node_values(ast, &nb->markable, mark_and_move_ast_value, NULL); @@ -438,18 +457,6 @@ rb_ast_dispose(rb_ast_t *ast) rb_ast_free(ast); } -VALUE -rb_ast_tokens(rb_ast_t *ast) -{ - return ast->node_buffer->tokens; -} - -void -rb_ast_set_tokens(rb_ast_t *ast, VALUE tokens) -{ - RB_OBJ_WRITE(ast, &ast->node_buffer->tokens, tokens); -} - VALUE rb_node_set_type(NODE *n, enum node_type t) { diff --git a/node.h b/node.h index 2e8868428aece1..371b33cff6ac35 100644 --- a/node.h +++ b/node.h @@ -40,7 +40,7 @@ struct node_buffer_struct { // - text of token // - location info // Array, whose entry is array - VALUE tokens; + rb_parser_ary_t *tokens; #ifdef UNIVERSAL_PARSER const rb_parser_config_t *config; #endif @@ -55,7 +55,6 @@ rb_ast_t *rb_ast_new(void); #endif size_t rb_ast_memsize(const rb_ast_t*); void rb_ast_dispose(rb_ast_t*); -VALUE rb_ast_tokens(rb_ast_t 
*ast); #if RUBY_DEBUG void rb_ast_node_type_change(NODE *n, enum node_type type); #endif @@ -65,7 +64,6 @@ void rb_node_init(NODE *n, enum node_type type); void rb_ast_mark_and_move(rb_ast_t *ast, bool reference_updating); void rb_ast_update_references(rb_ast_t*); void rb_ast_free(rb_ast_t*); -void rb_ast_set_tokens(rb_ast_t*, VALUE); NODE *rb_ast_newnode(rb_ast_t*, enum node_type type, size_t size, size_t alignment); void rb_ast_delete_node(rb_ast_t*, NODE *n); rb_ast_id_table_t *rb_ast_new_local_table(rb_ast_t*, int); diff --git a/parse.y b/parse.y index de90ee797ff058..ca77cd9abf2359 100644 --- a/parse.y +++ b/parse.y @@ -634,7 +634,7 @@ struct parser_params { /* id for terms */ int token_id; /* Array for term tokens */ - VALUE tokens; + rb_parser_ary_t *tokens; #else /* Ripper only */ @@ -875,170 +875,170 @@ peek_end_expect_token_locations(struct parser_params *p) return p->end_expect_token_locations; } -static ID -parser_token2id(struct parser_params *p, enum yytokentype tok) +static const char * +parser_token2char(struct parser_params *p, enum yytokentype tok) { switch ((int) tok) { -#define TOKEN2ID(tok) case tok: return rb_intern(#tok); -#define TOKEN2ID2(tok, name) case tok: return rb_intern(name); - TOKEN2ID2(' ', "words_sep") - TOKEN2ID2('!', "!") - TOKEN2ID2('%', "%"); - TOKEN2ID2('&', "&"); - TOKEN2ID2('*', "*"); - TOKEN2ID2('+', "+"); - TOKEN2ID2('-', "-"); - TOKEN2ID2('/', "/"); - TOKEN2ID2('<', "<"); - TOKEN2ID2('=', "="); - TOKEN2ID2('>', ">"); - TOKEN2ID2('?', "?"); - TOKEN2ID2('^', "^"); - TOKEN2ID2('|', "|"); - TOKEN2ID2('~', "~"); - TOKEN2ID2(':', ":"); - TOKEN2ID2(',', ","); - TOKEN2ID2('.', "."); - TOKEN2ID2(';', ";"); - TOKEN2ID2('`', "`"); - TOKEN2ID2('\n', "nl"); - TOKEN2ID2('{', "{"); - TOKEN2ID2('}', "}"); - TOKEN2ID2('[', "["); - TOKEN2ID2(']', "]"); - TOKEN2ID2('(', "("); - TOKEN2ID2(')', ")"); - TOKEN2ID2('\\', "backslash"); - TOKEN2ID(keyword_class); - TOKEN2ID(keyword_module); - TOKEN2ID(keyword_def); - TOKEN2ID(keyword_undef); - 
TOKEN2ID(keyword_begin); - TOKEN2ID(keyword_rescue); - TOKEN2ID(keyword_ensure); - TOKEN2ID(keyword_end); - TOKEN2ID(keyword_if); - TOKEN2ID(keyword_unless); - TOKEN2ID(keyword_then); - TOKEN2ID(keyword_elsif); - TOKEN2ID(keyword_else); - TOKEN2ID(keyword_case); - TOKEN2ID(keyword_when); - TOKEN2ID(keyword_while); - TOKEN2ID(keyword_until); - TOKEN2ID(keyword_for); - TOKEN2ID(keyword_break); - TOKEN2ID(keyword_next); - TOKEN2ID(keyword_redo); - TOKEN2ID(keyword_retry); - TOKEN2ID(keyword_in); - TOKEN2ID(keyword_do); - TOKEN2ID(keyword_do_cond); - TOKEN2ID(keyword_do_block); - TOKEN2ID(keyword_do_LAMBDA); - TOKEN2ID(keyword_return); - TOKEN2ID(keyword_yield); - TOKEN2ID(keyword_super); - TOKEN2ID(keyword_self); - TOKEN2ID(keyword_nil); - TOKEN2ID(keyword_true); - TOKEN2ID(keyword_false); - TOKEN2ID(keyword_and); - TOKEN2ID(keyword_or); - TOKEN2ID(keyword_not); - TOKEN2ID(modifier_if); - TOKEN2ID(modifier_unless); - TOKEN2ID(modifier_while); - TOKEN2ID(modifier_until); - TOKEN2ID(modifier_rescue); - TOKEN2ID(keyword_alias); - TOKEN2ID(keyword_defined); - TOKEN2ID(keyword_BEGIN); - TOKEN2ID(keyword_END); - TOKEN2ID(keyword__LINE__); - TOKEN2ID(keyword__FILE__); - TOKEN2ID(keyword__ENCODING__); - TOKEN2ID(tIDENTIFIER); - TOKEN2ID(tFID); - TOKEN2ID(tGVAR); - TOKEN2ID(tIVAR); - TOKEN2ID(tCONSTANT); - TOKEN2ID(tCVAR); - TOKEN2ID(tLABEL); - TOKEN2ID(tINTEGER); - TOKEN2ID(tFLOAT); - TOKEN2ID(tRATIONAL); - TOKEN2ID(tIMAGINARY); - TOKEN2ID(tCHAR); - TOKEN2ID(tNTH_REF); - TOKEN2ID(tBACK_REF); - TOKEN2ID(tSTRING_CONTENT); - TOKEN2ID(tREGEXP_END); - TOKEN2ID(tDUMNY_END); - TOKEN2ID(tSP); - TOKEN2ID(tUPLUS); - TOKEN2ID(tUMINUS); - TOKEN2ID(tPOW); - TOKEN2ID(tCMP); - TOKEN2ID(tEQ); - TOKEN2ID(tEQQ); - TOKEN2ID(tNEQ); - TOKEN2ID(tGEQ); - TOKEN2ID(tLEQ); - TOKEN2ID(tANDOP); - TOKEN2ID(tOROP); - TOKEN2ID(tMATCH); - TOKEN2ID(tNMATCH); - TOKEN2ID(tDOT2); - TOKEN2ID(tDOT3); - TOKEN2ID(tBDOT2); - TOKEN2ID(tBDOT3); - TOKEN2ID(tAREF); - TOKEN2ID(tASET); - TOKEN2ID(tLSHFT); - 
TOKEN2ID(tRSHFT); - TOKEN2ID(tANDDOT); - TOKEN2ID(tCOLON2); - TOKEN2ID(tCOLON3); - TOKEN2ID(tOP_ASGN); - TOKEN2ID(tASSOC); - TOKEN2ID(tLPAREN); - TOKEN2ID(tLPAREN_ARG); - TOKEN2ID(tRPAREN); - TOKEN2ID(tLBRACK); - TOKEN2ID(tLBRACE); - TOKEN2ID(tLBRACE_ARG); - TOKEN2ID(tSTAR); - TOKEN2ID(tDSTAR); - TOKEN2ID(tAMPER); - TOKEN2ID(tLAMBDA); - TOKEN2ID(tSYMBEG); - TOKEN2ID(tSTRING_BEG); - TOKEN2ID(tXSTRING_BEG); - TOKEN2ID(tREGEXP_BEG); - TOKEN2ID(tWORDS_BEG); - TOKEN2ID(tQWORDS_BEG); - TOKEN2ID(tSYMBOLS_BEG); - TOKEN2ID(tQSYMBOLS_BEG); - TOKEN2ID(tSTRING_END); - TOKEN2ID(tSTRING_DEND); - TOKEN2ID(tSTRING_DBEG); - TOKEN2ID(tSTRING_DVAR); - TOKEN2ID(tLAMBEG); - TOKEN2ID(tLABEL_END); - TOKEN2ID(tIGNORED_NL); - TOKEN2ID(tCOMMENT); - TOKEN2ID(tEMBDOC_BEG); - TOKEN2ID(tEMBDOC); - TOKEN2ID(tEMBDOC_END); - TOKEN2ID(tHEREDOC_BEG); - TOKEN2ID(tHEREDOC_END); - TOKEN2ID(k__END__); - TOKEN2ID(tLOWEST); - TOKEN2ID(tUMINUS_NUM); - TOKEN2ID(tLAST_TOKEN); -#undef TOKEN2ID -#undef TOKEN2ID2 +#define TOKEN2CHAR(tok) case tok: return (#tok); +#define TOKEN2CHAR2(tok, name) case tok: return (name); + TOKEN2CHAR2(' ', "word_sep"); + TOKEN2CHAR2('!', "!") + TOKEN2CHAR2('%', "%"); + TOKEN2CHAR2('&', "&"); + TOKEN2CHAR2('*', "*"); + TOKEN2CHAR2('+', "+"); + TOKEN2CHAR2('-', "-"); + TOKEN2CHAR2('/', "/"); + TOKEN2CHAR2('<', "<"); + TOKEN2CHAR2('=', "="); + TOKEN2CHAR2('>', ">"); + TOKEN2CHAR2('?', "?"); + TOKEN2CHAR2('^', "^"); + TOKEN2CHAR2('|', "|"); + TOKEN2CHAR2('~', "~"); + TOKEN2CHAR2(':', ":"); + TOKEN2CHAR2(',', ","); + TOKEN2CHAR2('.', "."); + TOKEN2CHAR2(';', ";"); + TOKEN2CHAR2('`', "`"); + TOKEN2CHAR2('\n', "nl"); + TOKEN2CHAR2('{', "\"{\""); + TOKEN2CHAR2('}', "\"}\""); + TOKEN2CHAR2('[', "\"[\""); + TOKEN2CHAR2(']', "\"]\""); + TOKEN2CHAR2('(', "\"(\""); + TOKEN2CHAR2(')', "\")\""); + TOKEN2CHAR2('\\', "backslash"); + TOKEN2CHAR(keyword_class); + TOKEN2CHAR(keyword_module); + TOKEN2CHAR(keyword_def); + TOKEN2CHAR(keyword_undef); + TOKEN2CHAR(keyword_begin); + 
TOKEN2CHAR(keyword_rescue); + TOKEN2CHAR(keyword_ensure); + TOKEN2CHAR(keyword_end); + TOKEN2CHAR(keyword_if); + TOKEN2CHAR(keyword_unless); + TOKEN2CHAR(keyword_then); + TOKEN2CHAR(keyword_elsif); + TOKEN2CHAR(keyword_else); + TOKEN2CHAR(keyword_case); + TOKEN2CHAR(keyword_when); + TOKEN2CHAR(keyword_while); + TOKEN2CHAR(keyword_until); + TOKEN2CHAR(keyword_for); + TOKEN2CHAR(keyword_break); + TOKEN2CHAR(keyword_next); + TOKEN2CHAR(keyword_redo); + TOKEN2CHAR(keyword_retry); + TOKEN2CHAR(keyword_in); + TOKEN2CHAR(keyword_do); + TOKEN2CHAR(keyword_do_cond); + TOKEN2CHAR(keyword_do_block); + TOKEN2CHAR(keyword_do_LAMBDA); + TOKEN2CHAR(keyword_return); + TOKEN2CHAR(keyword_yield); + TOKEN2CHAR(keyword_super); + TOKEN2CHAR(keyword_self); + TOKEN2CHAR(keyword_nil); + TOKEN2CHAR(keyword_true); + TOKEN2CHAR(keyword_false); + TOKEN2CHAR(keyword_and); + TOKEN2CHAR(keyword_or); + TOKEN2CHAR(keyword_not); + TOKEN2CHAR(modifier_if); + TOKEN2CHAR(modifier_unless); + TOKEN2CHAR(modifier_while); + TOKEN2CHAR(modifier_until); + TOKEN2CHAR(modifier_rescue); + TOKEN2CHAR(keyword_alias); + TOKEN2CHAR(keyword_defined); + TOKEN2CHAR(keyword_BEGIN); + TOKEN2CHAR(keyword_END); + TOKEN2CHAR(keyword__LINE__); + TOKEN2CHAR(keyword__FILE__); + TOKEN2CHAR(keyword__ENCODING__); + TOKEN2CHAR(tIDENTIFIER); + TOKEN2CHAR(tFID); + TOKEN2CHAR(tGVAR); + TOKEN2CHAR(tIVAR); + TOKEN2CHAR(tCONSTANT); + TOKEN2CHAR(tCVAR); + TOKEN2CHAR(tLABEL); + TOKEN2CHAR(tINTEGER); + TOKEN2CHAR(tFLOAT); + TOKEN2CHAR(tRATIONAL); + TOKEN2CHAR(tIMAGINARY); + TOKEN2CHAR(tCHAR); + TOKEN2CHAR(tNTH_REF); + TOKEN2CHAR(tBACK_REF); + TOKEN2CHAR(tSTRING_CONTENT); + TOKEN2CHAR(tREGEXP_END); + TOKEN2CHAR(tDUMNY_END); + TOKEN2CHAR(tSP); + TOKEN2CHAR(tUPLUS); + TOKEN2CHAR(tUMINUS); + TOKEN2CHAR(tPOW); + TOKEN2CHAR(tCMP); + TOKEN2CHAR(tEQ); + TOKEN2CHAR(tEQQ); + TOKEN2CHAR(tNEQ); + TOKEN2CHAR(tGEQ); + TOKEN2CHAR(tLEQ); + TOKEN2CHAR(tANDOP); + TOKEN2CHAR(tOROP); + TOKEN2CHAR(tMATCH); + TOKEN2CHAR(tNMATCH); + TOKEN2CHAR(tDOT2); + 
TOKEN2CHAR(tDOT3); + TOKEN2CHAR(tBDOT2); + TOKEN2CHAR(tBDOT3); + TOKEN2CHAR(tAREF); + TOKEN2CHAR(tASET); + TOKEN2CHAR(tLSHFT); + TOKEN2CHAR(tRSHFT); + TOKEN2CHAR(tANDDOT); + TOKEN2CHAR(tCOLON2); + TOKEN2CHAR(tCOLON3); + TOKEN2CHAR(tOP_ASGN); + TOKEN2CHAR(tASSOC); + TOKEN2CHAR(tLPAREN); + TOKEN2CHAR(tLPAREN_ARG); + TOKEN2CHAR(tRPAREN); + TOKEN2CHAR(tLBRACK); + TOKEN2CHAR(tLBRACE); + TOKEN2CHAR(tLBRACE_ARG); + TOKEN2CHAR(tSTAR); + TOKEN2CHAR(tDSTAR); + TOKEN2CHAR(tAMPER); + TOKEN2CHAR(tLAMBDA); + TOKEN2CHAR(tSYMBEG); + TOKEN2CHAR(tSTRING_BEG); + TOKEN2CHAR(tXSTRING_BEG); + TOKEN2CHAR(tREGEXP_BEG); + TOKEN2CHAR(tWORDS_BEG); + TOKEN2CHAR(tQWORDS_BEG); + TOKEN2CHAR(tSYMBOLS_BEG); + TOKEN2CHAR(tQSYMBOLS_BEG); + TOKEN2CHAR(tSTRING_END); + TOKEN2CHAR(tSTRING_DEND); + TOKEN2CHAR(tSTRING_DBEG); + TOKEN2CHAR(tSTRING_DVAR); + TOKEN2CHAR(tLAMBEG); + TOKEN2CHAR(tLABEL_END); + TOKEN2CHAR(tIGNORED_NL); + TOKEN2CHAR(tCOMMENT); + TOKEN2CHAR(tEMBDOC_BEG); + TOKEN2CHAR(tEMBDOC); + TOKEN2CHAR(tEMBDOC_END); + TOKEN2CHAR(tHEREDOC_BEG); + TOKEN2CHAR(tHEREDOC_END); + TOKEN2CHAR(k__END__); + TOKEN2CHAR(tLOWEST); + TOKEN2CHAR(tUMINUS_NUM); + TOKEN2CHAR(tLAST_TOKEN); +#undef TOKEN2CHAR +#undef TOKEN2CHAR2 } rb_bug("parser_token2id: unknown token %d", tok); @@ -2565,8 +2565,8 @@ rb_parser_str_resize(struct parser_params *p, rb_parser_string_t *str, long len) return str; } -#ifndef UNIVERSAL_PARSER #ifndef RIPPER +#ifndef UNIVERSAL_PARSER # define PARSER_ENC_STRING_GETMEM(str, ptrvar, lenvar, encvar) \ ((ptrvar) = str->ptr, \ (lenvar) = str->len, \ @@ -2587,7 +2587,73 @@ rb_parser_string_hash_cmp(rb_parser_string_t *str1, rb_parser_string_t *str2) memcmp(ptr1, ptr2, len1) != 0); } #endif -#endif + +static void +rb_parser_ary_extend(rb_parser_t *p, rb_parser_ary_t *ary, long len) +{ + long i; + if (ary->capa < len) { + ary->capa = len; + ary->data = xrealloc(ary->data, sizeof(void *) * len); + for (i = ary->len; i < len; i++) { + ary->data[i] = 0; + } + } +} + +static rb_parser_ary_t * 
+rb_parser_ary_new_capa(rb_parser_t *p, long len) +{ + if (len < 0) { + rb_bug("negative array size (or size too big): %ld", len); + } + rb_parser_ary_t *ary = xcalloc(1, sizeof(rb_parser_ary_t)); + ary->len = 0; + ary->capa = len; + if (0 < len) { + ary->data = (rb_parser_ast_token_t **)xcalloc(len, sizeof(rb_parser_ast_token_t *)); + } + else { + ary->data = NULL; + } + return ary; +} +#define rb_parser_ary_new2 rb_parser_ary_new_capa + +inline static rb_parser_ary_t * +rb_parser_ary_new(rb_parser_t *p) +{ + return rb_parser_ary_new_capa(p, 0); +} + +static rb_parser_ary_t * +rb_parser_ary_push(rb_parser_t *p, rb_parser_ary_t *ary, rb_parser_ast_token_t *val) +{ + if (ary->len == ary->capa) { + rb_parser_ary_extend(p, ary, ary->len == 0 ? 1 : ary->len * 2); + } + ary->data[ary->len++] = val; + return ary; +} + +static void +rb_parser_ast_token_free(rb_parser_t *p, rb_parser_ast_token_t *token) +{ + if (!token) return; + rb_parser_string_free(p, token->str); + xfree(token); +} + +static void +rb_parser_tokens_free(rb_parser_t *p, rb_parser_ary_t *tokens) +{ + for (long i = 0; i < tokens->len; i++) { + rb_parser_ast_token_free(p, tokens->data[i]); + } + xfree(tokens); +} + +#endif /* !RIPPER */ %} %expect 0 @@ -7035,35 +7101,100 @@ parser_has_token(struct parser_params *p) return pcur > ptok; } -static VALUE -code_loc_to_ary(struct parser_params *p, const rb_code_location_t *loc) +static const char * +escaped_char(int c) { - VALUE ary = rb_ary_new_from_args(4, - INT2NUM(loc->beg_pos.lineno), INT2NUM(loc->beg_pos.column), - INT2NUM(loc->end_pos.lineno), INT2NUM(loc->end_pos.column)); - rb_obj_freeze(ary); - - return ary; + switch (c) { + case '"': return "\\\""; + case '\\': return "\\\\"; + case '\0': return "\\0"; + case '\n': return "\\n"; + case '\r': return "\\r"; + case '\t': return "\\t"; + case '\f': return "\\f"; + case '\013': return "\\v"; + case '\010': return "\\b"; + case '\007': return "\\a"; + case '\033': return "\\e"; + case '\x7f': return "\\c?"; 
+ } + return NULL; } -static void -parser_append_tokens(struct parser_params *p, VALUE str, enum yytokentype t, int line) +static rb_parser_string_t * +rb_parser_str_escape(struct parser_params *p, rb_parser_string_t *str) { - VALUE ary; - int token_id; + rb_encoding *enc = p->enc; + const char *ptr = str->ptr; + const char *pend = ptr + str->len; + const char *prev = ptr; + char charbuf[5] = {'\\', 'x', 0, 0, 0}; + rb_parser_string_t * result = rb_parser_string_new(p, 0, 0); + int asciicompat = rb_enc_asciicompat(enc); + + while (ptr < pend) { + unsigned int c; + const char *cc; + int n = rb_enc_precise_mbclen(ptr, pend, enc); + if (!MBCLEN_CHARFOUND_P(n)) { + if (ptr > prev) rb_parser_str_buf_cat(p, result, prev, ptr - prev); + n = rb_enc_mbminlen(enc); + if (pend < ptr + n) + n = (int)(pend - ptr); + while (n--) { + c = *ptr & 0xf0 >> 4; + charbuf[2] = (c < 10) ? '0' + c : 'A' + c - 10; + c = *ptr & 0x0f; + charbuf[3] = (c < 10) ? '0' + c : 'A' + c - 10; + rb_parser_str_buf_cat(p, result, charbuf, 4); + prev = ++ptr; + } + continue; + } + n = MBCLEN_CHARFOUND_LEN(n); + c = rb_enc_mbc_to_codepoint(ptr, pend, enc); + ptr += n; + cc = escaped_char(c); + if (cc) { + if (ptr - n > prev) rb_parser_str_buf_cat(p, result, prev, ptr - n - prev); + rb_parser_str_buf_cat(p, result, cc, strlen(cc)); + prev = ptr; + } + else if (asciicompat && rb_enc_isascii(c, enc) && ISPRINT(c)) { + } + else { + if (ptr - n > prev) { + rb_parser_str_buf_cat(p, result, prev, ptr - n - prev); + prev = ptr - n; + } + rb_parser_str_buf_cat(p, result, prev, ptr - prev); + prev = ptr; + } + } + if (ptr > prev) rb_parser_str_buf_cat(p, result, prev, ptr - prev); - ary = rb_ary_new2(4); - token_id = p->token_id; - rb_ary_push(ary, INT2FIX(token_id)); - rb_ary_push(ary, ID2SYM(parser_token2id(p, t))); - rb_ary_push(ary, str); - rb_ary_push(ary, code_loc_to_ary(p, p->yylloc)); - rb_obj_freeze(ary); - rb_ary_push(p->tokens, ary); + return result; +} + +static void +parser_append_tokens(struct 
parser_params *p, rb_parser_string_t *str, enum yytokentype t, int line) +{ + rb_parser_ast_token_t *token = xcalloc(1, sizeof(rb_parser_ast_token_t)); + token->id = p->token_id; + token->type_name = parser_token2char(p, t); + token->str = str; + token->loc.beg_pos = p->yylloc->beg_pos; + token->loc.end_pos = p->yylloc->end_pos; + rb_parser_ary_push(p, p->tokens, token); p->token_id++; if (p->debug) { - rb_parser_printf(p, "Append tokens (line: %d) %"PRIsVALUE"\n", line, ary); + rb_parser_string_t *str_escaped = rb_parser_str_escape(p, str); + rb_parser_printf(p, "Append tokens (line: %d) [%d, :%s, \"%s\", [%d, %d, %d, %d]]\n", + line, token->id, token->type_name, str_escaped->ptr, + token->loc.beg_pos.lineno, token->loc.beg_pos.column, + token->loc.end_pos.lineno, token->loc.end_pos.column); + rb_parser_string_free(p, str_escaped); } } @@ -7077,7 +7208,7 @@ parser_dispatch_scan_event(struct parser_params *p, enum yytokentype t, int line RUBY_SET_YYLLOC(*p->yylloc); if (p->keep_tokens) { - VALUE str = STR_NEW(p->lex.ptok, p->lex.pcur - p->lex.ptok); + rb_parser_string_t *str = rb_parser_encoding_string_new(p, p->lex.ptok, p->lex.pcur - p->lex.ptok, p->enc); parser_append_tokens(p, str, t, line); } @@ -7095,7 +7226,8 @@ parser_dispatch_delayed_token(struct parser_params *p, enum yytokentype t, int l RUBY_SET_YYLLOC_OF_DELAYED_TOKEN(*p->yylloc); if (p->keep_tokens) { - parser_append_tokens(p, p->delayed.token, t, line); + rb_parser_string_t *str = rb_str_to_parser_string(p, p->delayed.token); + parser_append_tokens(p, str, t, line); } p->delayed.token = Qnil; @@ -7607,7 +7739,7 @@ yycompile0(VALUE arg) tree = NEW_NIL(&NULL_LOC); } else { - VALUE tokens = p->tokens; + rb_parser_ary_t *tokens = p->tokens; NODE *prelude; NODE *body = parser_append_options(p, RNODE_SCOPE(tree)->nd_body); prelude = block_append(p, p->eval_tree_begin, body); @@ -7615,8 +7747,8 @@ yycompile0(VALUE arg) p->ast->body.frozen_string_literal = p->frozen_string_literal; 
p->ast->body.coverage_enabled = cov; if (p->keep_tokens) { - rb_obj_freeze(tokens); - rb_ast_set_tokens(p->ast, tokens); + p->ast->node_buffer->tokens = tokens; + p->tokens = NULL; } } p->ast->body.root = tree; @@ -9230,7 +9362,7 @@ parser_dispatch_heredoc_end(struct parser_params *p, int line) dispatch_delayed_token(p, tSTRING_CONTENT); if (p->keep_tokens) { - VALUE str = STR_NEW(p->lex.ptok, p->lex.pend - p->lex.ptok); + rb_parser_string_t *str = rb_parser_encoding_string_new(p, p->lex.ptok, p->lex.pend - p->lex.ptok, p->enc); RUBY_SET_YYLLOC_OF_HEREDOC_END(*p->yylloc); parser_append_tokens(p, str, tHEREDOC_END, line); } @@ -15973,7 +16105,7 @@ parser_initialize(struct parser_params *p) p->error_buffer = Qfalse; p->end_expect_token_locations = NULL; p->token_id = 0; - p->tokens = Qnil; + p->tokens = NULL; #else p->result = Qnil; p->parsing_thread = Qnil; @@ -16006,7 +16138,6 @@ rb_ruby_parser_mark(void *ptr) #ifndef RIPPER rb_gc_mark(p->debug_lines); rb_gc_mark(p->error_buffer); - rb_gc_mark(p->tokens); #else rb_gc_mark(p->value); rb_gc_mark(p->result); @@ -16028,6 +16159,12 @@ rb_ruby_parser_free(void *ptr) struct parser_params *p = (struct parser_params*)ptr; struct local_vars *local, *prev; +#ifndef RIPPER + if (p->tokens) { + rb_parser_tokens_free(p, p->tokens); + } +#endif + if (p->tokenbuf) { ruby_sized_xfree(p->tokenbuf, p->toksiz); } @@ -16145,8 +16282,7 @@ void rb_ruby_parser_keep_tokens(rb_parser_t *p) { p->keep_tokens = 1; - // TODO - p->tokens = rb_ary_new(); + p->tokens = rb_parser_ary_new_capa(p, 10); } #ifndef UNIVERSAL_PARSER diff --git a/ruby_parser.c b/ruby_parser.c index 8e2371fd1d92dd..d37dc388cd28df 100644 --- a/ruby_parser.c +++ b/ruby_parser.c @@ -461,6 +461,27 @@ str_coderange_scan_restartable(const char *s, const char *e, void *enc, int *cr) return rb_str_coderange_scan_restartable(s, e, (rb_encoding *)enc, cr); } +static int +enc_mbminlen(void *enc) +{ + return rb_enc_mbminlen((rb_encoding *)enc); +} + +static bool 
+enc_isascii(OnigCodePoint c, void *enc) +{ + return rb_enc_isascii(c, (rb_encoding *)enc); +} + +static OnigCodePoint +enc_mbc_to_codepoint(const char *p, const char *e, void *enc) +{ + const OnigUChar *up = RBIMPL_CAST((const OnigUChar *)p); + const OnigUChar *ue = RBIMPL_CAST((const OnigUChar *)e); + + return ONIGENC_MBC_TO_CODE((rb_encoding *)enc, up, ue); +} + VALUE rb_io_gets_internal(VALUE io); extern VALUE rb_eArgError; extern VALUE rb_mRubyVMFrozenCore; @@ -596,6 +617,10 @@ static const rb_parser_config_t rb_global_parser_config = { .encoding_set = encoding_set, .encoding_is_ascii8bit = encoding_is_ascii8bit, .usascii_encoding = usascii_encoding, + .enc_coderange_broken = ENC_CODERANGE_BROKEN, + .enc_mbminlen = enc_mbminlen, + .enc_isascii = enc_isascii, + .enc_mbc_to_codepoint = enc_mbc_to_codepoint, .ractor_make_shareable = rb_ractor_make_shareable, diff --git a/rubyparser.h b/rubyparser.h index 34ee117f650974..cf40ad970ece18 100644 --- a/rubyparser.h +++ b/rubyparser.h @@ -189,6 +189,22 @@ typedef struct rb_code_location_struct { rb_code_position_t end_pos; } rb_code_location_t; +typedef struct rb_parser_ast_token { + int id; + const char *type_name; + rb_parser_string_t *str; + rb_code_location_t loc; +} rb_parser_ast_token_t; + +/* + * Array-like object for parser + */ +typedef struct rb_parser_ary { + rb_parser_ast_token_t **data; + long len; // current size + long capa; // capacity +} rb_parser_ary_t; + /* Header part of AST Node */ typedef struct RNode { VALUE flags; @@ -1340,6 +1356,10 @@ typedef struct rb_parser_config_struct { void (*encoding_set)(VALUE obj, int encindex); int (*encoding_is_ascii8bit)(VALUE obj); rb_encoding *(*usascii_encoding)(void); + int enc_coderange_broken; + int (*enc_mbminlen)(rb_encoding *enc); + bool (*enc_isascii)(OnigCodePoint c, rb_encoding *enc); + OnigCodePoint (*enc_mbc_to_codepoint)(const char *p, const char *e, rb_encoding *enc); /* Ractor */ VALUE (*ractor_make_shareable)(VALUE obj); diff --git 
a/universal_parser.c b/universal_parser.c index 2cd5c7407badc5..08fdfe5b4a8008 100644 --- a/universal_parser.c +++ b/universal_parser.c @@ -293,6 +293,9 @@ struct rb_imemo_tmpbuf_struct { #define rb_mRubyVMFrozenCore p->config->mRubyVMFrozenCore() #undef rb_long2int #define rb_long2int p->config->long2int +#define rb_enc_mbminlen p->config->enc_mbminlen +#define rb_enc_isascii p->config->enc_isascii +#define rb_enc_mbc_to_codepoint p->config->enc_mbc_to_codepoint #define rb_node_case_when_optimizable_literal p->config->node_case_when_optimizable_literal