Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement String#undump #1765

Closed
wants to merge 18 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions NEWS
Original file line number Diff line number Diff line change
Expand Up @@ -173,6 +173,7 @@ with all sufficient information, see the ChangeLog file or Redmine
* String#delete_suffix, String#delete_suffix! [Feature #13665]
* String#each_grapheme_cluster and String#grapheme_clusters to
enumerate grapheme clusters [Feature #13780]
* String#undump to unescape a String#dump'ed string [Feature #12275]

* Struct

Expand Down
301 changes: 284 additions & 17 deletions string.c
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
#include "ruby_assert.h"
#include "id.h"
#include "debug_counter.h"
#include "ruby/util.h"

#define BEG(no) (regs->beg[(no)])
#define END(no) (regs->end[(no)])
Expand Down Expand Up @@ -3422,13 +3423,34 @@ str_casecmp_p(VALUE str1, VALUE str2)
return rb_str_eql(folded_str1, folded_str2);
}

/*
 * Core substring search shared by rb_strseq_index() and the undump helpers.
 *
 * Looks for the sub_len bytes at sub_ptr inside the buffer starting at
 * str_ptr, using rb_memsearch() with encoding enc.  str_ptr_end is passed to
 * rb_enc_right_char_head() as the end of the buffer.
 *
 * The initial search window is (str_len - offset) bytes long but starts at
 * str_ptr itself -- presumably callers with a non-zero offset pass a str_ptr
 * that has already been advanced by offset; TODO confirm against
 * rb_strseq_index().
 *
 * Returns the match position plus offset on success, or a negative value
 * (the rb_memsearch() result, or -1) when nothing acceptable is found.
 */
static long
strseq_core(const char *str_ptr, const char *str_ptr_end, long str_len,
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, this function is extracted from rb_strseq_index because I needed to use this from undump_after_backslash too.

const char *sub_ptr, long sub_len, long offset, rb_encoding *enc)
{
const char *search_start = str_ptr;
long pos, search_len = str_len - offset;

for (;;) {
const char *t;
/* raw search within the current window */
pos = rb_memsearch(sub_ptr, sub_len, search_start, search_len, enc);
if (pos < 0) return pos;
/* accept the hit only if rb_enc_right_char_head() maps it onto itself;
 * presumably this rejects hits that begin inside a multibyte character */
t = rb_enc_right_char_head(search_start, search_start+pos, str_ptr_end, enc);
if (t == search_start + pos) break;
/* rejected: shrink the window and resume from t, accumulating the skipped
 * distance in offset so the final result stays relative to the start */
search_len -= t - search_start;
if (search_len <= 0) return -1;
offset += t - search_start;
search_start = t;
}
return pos + offset;
}

#define rb_str_index(str, sub, offset) rb_strseq_index(str, sub, offset, 0)

static long
rb_strseq_index(VALUE str, VALUE sub, long offset, int in_byte)
{
const char *str_ptr, *str_ptr_end, *sub_ptr, *search_start;
long pos, str_len, sub_len, search_len;
const char *str_ptr, *str_ptr_end, *sub_ptr;
long str_len, sub_len;
int single_byte = single_byte_optimizable(str);
rb_encoding *enc;

Expand Down Expand Up @@ -3458,21 +3480,7 @@ rb_strseq_index(VALUE str, VALUE sub, long offset, int in_byte)
if (sub_len == 0) return offset;

/* need proceed one character at a time */

search_start = str_ptr;
search_len = RSTRING_LEN(str) - offset;
for (;;) {
const char *t;
pos = rb_memsearch(sub_ptr, sub_len, search_start, search_len, enc);
if (pos < 0) return pos;
t = rb_enc_right_char_head(search_start, search_start+pos, str_ptr_end, enc);
if (t == search_start + pos) break;
search_len -= t - search_start;
if (search_len <= 0) return -1;
offset += t - search_start;
search_start = t;
}
return pos + offset;
return strseq_core(str_ptr, str_ptr_end, str_len, sub_ptr, sub_len, offset, enc);
}


Expand Down Expand Up @@ -6073,6 +6081,264 @@ rb_str_dump(VALUE str)
return result;
}

/*
 * Overall shape of the text handed to String#undump, as classified by
 * check_undump_source_format().
 */
enum undump_source_format {
UNDUMP_SOURCE_SIMPLE, /* "..." */
UNDUMP_SOURCE_FORCE_ENCODING, /* "...".force_encoding("...") */
UNDUMP_SOURCE_INVALID /* neither of the two forms above */
};

/*
 * Classifies the undump input occupying the len bytes in [s, s_end).
 *
 * - Fewer than two bytes, or a first character other than '"': INVALID.
 * - Last character is '"': SIMPLE (the interior is not inspected here).
 * - Last character is ')': the text must contain '".force_encoding("'
 *   somewhere after its first byte, and the first '")' following that must
 *   be the final two bytes.  On success the bytes in between (the encoding
 *   name) are returned as a new String through *forced_enc_str, their byte
 *   count through *forced_enc_str_len, and the result is FORCE_ENCODING.
 * - Anything else: INVALID.
 *
 * The two out-parameters are written only on the FORCE_ENCODING path.
 */
static enum undump_source_format
check_undump_source_format(const char *s, const char *s_end, long len, rb_encoding *enc,
                           VALUE *forced_enc_str, long *forced_enc_str_len)
{
    static const char middle[] = "\".force_encoding(\"";
    static const long middle_len = rb_strlen_lit("\".force_encoding(\"");
    static const char tail[] = "\")";
    static const long tail_len = rb_strlen_lit("\")");
    static const long shortest_forced_len = rb_strlen_lit("\"\".force_encoding(\"\")");
    const char *last_char, *name_start;
    unsigned int first, last;
    long middle_at, name_off, tail_at;

    if (len < 2) return UNDUMP_SOURCE_INVALID;

    /* must open with a double quote */
    first = rb_enc_mbc_to_codepoint(s, s_end, enc);
    if (first != '"') return UNDUMP_SOURCE_INVALID;

    /* the final character selects which form to expect */
    last_char = rb_enc_prev_char(s, s_end, s_end, enc);
    last = rb_enc_mbc_to_codepoint(last_char, s_end, enc);
    if (last == '"') return UNDUMP_SOURCE_SIMPLE;
    if (last != ')') return UNDUMP_SOURCE_INVALID;
    if (len < shortest_forced_len) return UNDUMP_SOURCE_INVALID;

    /* locate '".force_encoding("'; position 0 is rejected as well */
    middle_at = strseq_core(s, s_end, len, middle, middle_len, 0, enc);
    if (middle_at <= 0) return UNDUMP_SOURCE_INVALID;

    /* locate '")' after the encoding name; it has to end the input */
    name_off = middle_at + middle_len;
    name_start = s + name_off;
    tail_at = strseq_core(name_start, s_end, len - name_off, tail, tail_len, 0, enc);
    if (tail_at < 0) return UNDUMP_SOURCE_INVALID;
    if (name_off + tail_at + tail_len != len) return UNDUMP_SOURCE_INVALID;

    *forced_enc_str_len = tail_at;
    *forced_enc_str = rb_str_new(name_start, *forced_enc_str_len);
    return UNDUMP_SOURCE_FORCE_ENCODING;
}

/*
 * Maps the letter following a backslash in a simple escape (n, r, t, f, v,
 * b, a, e) to the byte value it denotes.  Any other argument is treated as
 * unreachable; callers are expected to pass only those eight letters.
 */
static int
unescape_ascii(unsigned int c)
{
    static const struct {
        unsigned int letter;
        int value;
    } escapes[] = {
        {'n', '\n'},
        {'r', '\r'},
        {'t', '\t'},
        {'f', '\f'},
        {'v', 0x0b}, /* vertical tab */
        {'b', 0x08}, /* backspace */
        {'a', 0x07}, /* bell */
        {'e', 0x1b}, /* escape */
    };
    unsigned int i;

    for (i = 0; i < sizeof(escapes) / sizeof(escapes[0]); i++) {
        if (escapes[i].letter == c) return escapes[i].value;
    }
    UNREACHABLE;
}

/*
 * Appends the Unicode code point cp to undumped, encoded in enc.
 * Raises RuntimeError for values above U+10FFFF and for surrogates
 * (U+D800..U+DFFF), neither of which can be encoded as valid UTF-8.
 */
static void
undump_cat_unicode(VALUE undumped, unsigned int cp, rb_encoding *enc)
{
    char buf[6];
    int codelen;

    if (cp > 0x10ffff) {
        rb_raise(rb_eRuntimeError, "invalid Unicode codepoint (too large)");
    }
    if ((cp & 0xfffff800) == 0xd800) {
        rb_raise(rb_eRuntimeError, "invalid Unicode codepoint");
    }
    codelen = rb_enc_codelen(cp, enc);
    rb_enc_mbcput(cp, buf, enc);
    rb_str_cat(undumped, buf, codelen);
}

/*
 * Decodes one escape sequence and appends the result to undumped.
 *
 * s points at the character right after a backslash and s_end at the end of
 * the dumped body.  *penc is the encoding used to read the source; the first
 * \u escape switches both *penc and undumped to UTF-8.
 *
 * Returns how many bytes the caller must advance from the backslash itself
 * (the caller does `s += n` with s still on the backslash).  For an
 * unrecognised escape only the backslash is consumed and emitted, so the
 * following character is then handled as ordinary text.
 *
 * Raises RuntimeError on truncated or malformed \x, \uXXXX and \u{...}
 * escapes, including \u{...} bodies containing anything but hex digits and
 * \uXXXX values in the surrogate range.
 */
static int
undump_after_backslash(VALUE undumped, const char *s, const char *s_end, rb_encoding **penc)
{
    unsigned int c, c2;
    int n;
    size_t hexlen;
    long avail;
    char byte;
    static rb_encoding *enc_utf8 = NULL;

    c = rb_enc_codepoint_len(s, s_end, &n, *penc);
    switch (c) {
      case '\\':
      case '"':
      case '#':
        rb_str_cat(undumped, s, n); /* the escaped character stands for itself */
        n++;
        break;
      case 'n':
      case 'r':
      case 't':
      case 'f':
      case 'v':
      case 'b':
      case 'a':
      case 'e':
        byte = (char)unescape_ascii(c);
        rb_str_cat(undumped, &byte, 1L); /* always exactly one output byte */
        n++;
        break;
      case 'u':
        if (s+1 >= s_end) {
            rb_raise(rb_eRuntimeError, "invalid Unicode escape");
        }
        if (enc_utf8 == NULL) enc_utf8 = rb_utf8_encoding();
        if (*penc != enc_utf8) {
            /* a Unicode escape implies the result is UTF-8 */
            *penc = enc_utf8;
            rb_enc_associate(undumped, enc_utf8);
            ENC_CODERANGE_CLEAR(undumped);
        }
        c2 = rb_enc_codepoint_len(s+1, s_end, NULL, *penc);
        if (c2 == '{') { /* handle \u{...} form */
            const char *hexstr = s + 2;
            static const char* const close_brace = "}";
            unsigned long hex;
            long pos;

            if (hexstr >= s_end) {
                rb_raise(rb_eRuntimeError, "unterminated Unicode escape");
            }
            /* find close brace */
            pos = strseq_core(hexstr, s_end, s_end - hexstr, close_brace, 1, 0, *penc);
            if (pos < 0) {
                rb_raise(rb_eRuntimeError, "unterminated Unicode escape");
            }
            hex = scan_hex(hexstr, pos, &hexlen);
            /* everything between the braces must be hex digits; otherwise
             * the advance computed from hexlen would stop short of '}' */
            if (hexlen == 0 || hexlen > 6 || (long)hexlen != pos) {
                rb_raise(rb_eRuntimeError, "invalid Unicode escape");
            }
            undump_cat_unicode(undumped, (unsigned int)hex, *penc);
            n += (int)(rb_strlen_lit("u{}") + hexlen);
        }
        else { /* handle \uXXXX form */
            /* never scan beyond the dumped body */
            avail = s_end - (s + 1);
            c2 = (unsigned int)scan_hex(s+1, avail < 4 ? avail : 4, &hexlen);
            if (hexlen != 4) {
                rb_raise(rb_eRuntimeError, "invalid Unicode escape");
            }
            undump_cat_unicode(undumped, c2, *penc);
            n += rb_strlen_lit("uXXXX");
        }
        break;
      case 'x':
        if (s+1 >= s_end) {
            rb_raise(rb_eRuntimeError, "invalid hex escape");
        }
        /* never scan beyond the dumped body */
        avail = s_end - (s + 1);
        c2 = (unsigned int)scan_hex(s+1, avail < 2 ? avail : 2, &hexlen);
        if (hexlen != 2) {
            rb_raise(rb_eRuntimeError, "invalid hex escape");
        }
        byte = (char)c2;
        rb_str_cat(undumped, &byte, 1L);
        n += rb_strlen_lit("xXX");
        break;
      default:
        rb_str_cat(undumped, "\\", 1L); /* keep backslash */
    }

    return n;
}

static VALUE rb_str_is_ascii_only_p(VALUE str);

/*
 *  call-seq:
 *     str.undump   -> new_str
 *
 *  Returns a new string in which the escaping applied by String#dump has
 *  been reversed.  The receiver must be ASCII-only, contain no null byte,
 *  and be written either as <code>"..."</code> or as
 *  <code>"...".force_encoding("...")</code>; otherwise RuntimeError is
 *  raised.
 *
 *     "\"hello \\n ''\"".undump #=> "hello \n ''"
 */

static VALUE
str_undump(VALUE str)
{
    const char *cur = RSTRING_PTR(str);
    const char *stop = RSTRING_END(str);
    long len = RSTRING_LEN(str);
    rb_encoding *enc = rb_enc_get(str), *forced_enc;
    int step;
    unsigned int ch;
    enum undump_source_format source_format;
    VALUE undumped = rb_enc_str_new(cur, 0L, enc);
    VALUE forced_enc_str;
    long forced_enc_str_len;
    int w;

    /* preconditions on the receiver */
    rb_must_asciicompat(str);
    if (rb_str_is_ascii_only_p(str) == Qfalse) {
        rb_raise(rb_eRuntimeError, "non-ASCII character detected");
    }
    if (!str_null_check(str, &w)) {
        rb_raise(rb_eRuntimeError, "string contains null byte");
    }

    /* decide which of the two accepted wrappers is in use */
    source_format = check_undump_source_format(cur, stop, len, enc,
                                               &forced_enc_str, &forced_enc_str_len);
    if (source_format == UNDUMP_SOURCE_INVALID) {
        rb_raise(rb_eRuntimeError, "not wrapped with '\"' nor '\"...\".force_encoding(\"...\")' form");
    }
    if (source_format == UNDUMP_SOURCE_FORCE_ENCODING) {
        forced_enc = rb_find_encoding(forced_enc_str);
        if (forced_enc == NULL) {
            rb_raise(rb_eRuntimeError, "unknown encoding name - %"PRIsVALUE, forced_enc_str);
        }
    }

    /* narrow [cur, stop) down to the dumped body: drop the opening quote... */
    cur++;
    /* ...and either the closing quote or the whole force_encoding suffix */
    if (source_format == UNDUMP_SOURCE_SIMPLE) {
        stop--;
    }
    else {
        stop -= rb_strlen_lit("\".force_encoding(\"\")") + forced_enc_str_len;
    }

    while (cur < stop) {
        ch = rb_enc_codepoint_len(cur, stop, &step, enc);
        if (ch == '"') {
            rb_raise(rb_eRuntimeError, "non-escaped double quote detected");
        }
        if (ch == '\\') {
            if (cur + 1 >= stop) {
                rb_raise(rb_eRuntimeError, "invalid escape");
            }
            /* may switch enc to UTF-8; returns the distance to skip */
            step = undump_after_backslash(undumped, cur + 1, stop, &enc);
        }
        else {
            rb_str_cat(undumped, cur, step);
        }
        cur += step;
    }

    /* an explicit force_encoding("...") overrides whatever was inferred */
    if (source_format == UNDUMP_SOURCE_FORCE_ENCODING) {
        rb_enc_associate(undumped, forced_enc);
        ENC_CODERANGE_CLEAR(undumped);
    }
    OBJ_INFECT(undumped, str);
    return undumped;
}

static void
rb_str_check_dummy_enc(rb_encoding *enc)
Expand Down Expand Up @@ -10586,6 +10852,7 @@ Init_String(void)
rb_define_method(rb_cString, "to_str", rb_str_to_s, 0);
rb_define_method(rb_cString, "inspect", rb_str_inspect, 0);
rb_define_method(rb_cString, "dump", rb_str_dump, 0);
rb_define_method(rb_cString, "undump", str_undump, 0);

sym_ascii = ID2SYM(rb_intern("ascii"));
sym_turkic = ID2SYM(rb_intern("turkic"));
Expand Down
35 changes: 35 additions & 0 deletions test/ruby/test_string.rb
Original file line number Diff line number Diff line change
Expand Up @@ -753,6 +753,41 @@ def test_dump
assert_equal(S('"\\u{10ABCD}"'), b.dump)
end

# Exercises String#undump: round-trips of the escapes String#dump emits,
# encoding handling, and the inputs that must raise RuntimeError.
def test_undump
# control characters written as \xXX and as simple letter escapes
a = S("Test") << 1 << 2 << 3 << 9 << 13 << 10
assert_equal(a, S('"Test\\x01\\x02\\x03\\t\\r\\n"').undump)
# \xXX, \uXXXX and \u{...} escapes
assert_equal(S("\u{7F}"), S('"\\x7F"').undump)
assert_equal(S("\u{AB}"), S('"\\u00AB"').undump)
assert_equal(S("\u{ABC}"), S('"\\u0ABC"').undump)
assert_equal(S("\uABCD"), S('"\\uABCD"').undump)
assert_equal(S("\u{ABCDE}"), S('"\\u{ABCDE}"').undump)
assert_equal(S("\u{10ABCD}"), S('"\\u{10ABCD}"').undump)

# single-quoted literals: the backslash sequences below reach undump as
# escape text, not as already-decoded characters
assert_equal(S("äöü"), S('"\u00E4\u00F6\u00FC"').undump)
assert_equal(S("äöü"), S('"\xC3\xA4\xC3\xB6\xC3\xBC"').undump)

# a \u escape yields a UTF-8 result even when the source is EUC-JP
assert_equal(Encoding::UTF_8, S('"\\u3042"').encode(Encoding::EUC_JP).undump.encoding)

# "...".force_encoding("...") form sets the result encoding
assert_equal("abc".encode(Encoding::UTF_16LE),
'"a\x00b\x00c\x00".force_encoding("UTF-16LE")'.undump)

# an escaped backslash followed by '#'
assert_equal('\#', '"\\\\#"'.undump)
assert_equal('\#{', '"\\\\\#{"'.undump)

# malformed wrappers and disallowed content
assert_raise(RuntimeError) { S('\u3042').undump } # not wrapped in quotes
assert_raise(RuntimeError) { S('"".force_encoding()').undump } # no encoding argument
assert_raise(RuntimeError) { S('"".force_encoding("UNKNOWN")').undump } # unknown encoding name
assert_raise(RuntimeError) { S(%("\u00E4")).undump } # real non-ASCII character
assert_raise(RuntimeError) { S('""""').undump } # unescaped quotes inside

# truncated or unterminated escapes, and a raw null byte
assert_raise(RuntimeError) { S('"\u"').undump }
assert_raise(RuntimeError) { S('"\u{"').undump }
assert_raise(RuntimeError) { S('"\u{3042"').undump }
assert_raise(RuntimeError) { S('"\x"').undump }
assert_raise(RuntimeError) { S('"\\"').undump } # lone backslash before the closing quote
assert_raise(RuntimeError) { S(%("\0")).undump }
end

def test_dup
for taint in [ false, true ]
for frozen in [ false, true ]
Expand Down