Skip to content

string.c: Directly create strings with the correct encoding #12076

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Nov 13, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions encoding.c
Original file line number Diff line number Diff line change
Expand Up @@ -967,6 +967,21 @@ enc_set_index(VALUE obj, int idx)
rb_ivar_set(obj, rb_id_encoding(), INT2NUM(idx));
}

/*
 * Record the encoding `enc` on `obj` without any further checks beyond a
 * debug assertion that `obj` is encoding-capable.
 *
 * A NULL `enc` is treated as encoding index 0.  Indexes below
 * ENCODING_INLINE_MAX are stored inline on the object; any other index
 * is stored in the instance variable named by rb_id_encoding(), with the
 * inline slot set to ENCODING_INLINE_MAX.
 */
void
rb_enc_raw_set(VALUE obj, rb_encoding *enc)
{
    RUBY_ASSERT(enc_capable(obj));

    const int encindex = (enc != NULL) ? ENC_TO_ENCINDEX(enc) : 0;

    if (encindex >= ENCODING_INLINE_MAX) {
        /* Index does not fit inline: store ENCODING_INLINE_MAX inline,
         * then keep the real index in an instance variable. */
        ENCODING_SET_INLINED(obj, ENCODING_INLINE_MAX);
        rb_ivar_set(obj, rb_id_encoding(), INT2NUM(encindex));
    }
    else {
        ENCODING_SET_INLINED(obj, encindex);
    }
}

void
rb_enc_set_index(VALUE obj, int idx)
{
Expand Down
2 changes: 2 additions & 0 deletions internal/encoding.h
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,8 @@ int rb_encdb_dummy(const char *name);
void rb_encdb_declare(const char *name);
void rb_enc_set_base(const char *name, const char *orig);
int rb_enc_set_dummy(int index);
void rb_enc_raw_set(VALUE obj, rb_encoding *enc);

PUREFUNC(int rb_data_is_encoding(VALUE obj));

/* vm.c */
Expand Down
69 changes: 29 additions & 40 deletions string.c
Original file line number Diff line number Diff line change
Expand Up @@ -988,20 +988,26 @@ empty_str_alloc(VALUE klass)
}

static VALUE
str_new0(VALUE klass, const char *ptr, long len, int termlen)
str_enc_new(VALUE klass, const char *ptr, long len, rb_encoding *enc)
{
VALUE str;

if (len < 0) {
rb_raise(rb_eArgError, "negative string size (or size too big)");
}

if (enc == NULL) {
enc = rb_ascii8bit_encoding();
}

RUBY_DTRACE_CREATE_HOOK(STRING, len);

int termlen = rb_enc_mbminlen(enc);

if (STR_EMBEDDABLE_P(len, termlen)) {
str = str_alloc_embed(klass, len + termlen);
if (len == 0) {
ENC_CODERANGE_SET(str, ENC_CODERANGE_7BIT);
ENC_CODERANGE_SET(str, rb_enc_asciicompat(enc) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID);
}
}
else {
Expand All @@ -1013,9 +1019,13 @@ str_new0(VALUE klass, const char *ptr, long len, int termlen)
RSTRING(str)->as.heap.ptr =
rb_xmalloc_mul_add_mul(sizeof(char), len, sizeof(char), termlen);
}

rb_enc_raw_set(str, enc);

if (ptr) {
memcpy(RSTRING_PTR(str), ptr, len);
}

STR_SET_LEN(str, len);
TERM_FILL(RSTRING_PTR(str) + len, termlen);
return str;
Expand All @@ -1024,7 +1034,7 @@ str_new0(VALUE klass, const char *ptr, long len, int termlen)
static VALUE
str_new(VALUE klass, const char *ptr, long len)
{
return str_new0(klass, ptr, len, 1);
return str_enc_new(klass, ptr, len, rb_ascii8bit_encoding());
}

VALUE
Expand All @@ -1036,29 +1046,19 @@ rb_str_new(const char *ptr, long len)
VALUE
rb_usascii_str_new(const char *ptr, long len)
{
VALUE str = rb_str_new(ptr, len);
ENCODING_CODERANGE_SET(str, rb_usascii_encindex(), ENC_CODERANGE_7BIT);
return str;
return str_enc_new(rb_cString, ptr, len, rb_usascii_encoding());
}

VALUE
rb_utf8_str_new(const char *ptr, long len)
{
VALUE str = str_new(rb_cString, ptr, len);
rb_enc_associate_index(str, rb_utf8_encindex());
return str;
return str_enc_new(rb_cString, ptr, len, rb_utf8_encoding());
}

VALUE
rb_enc_str_new(const char *ptr, long len, rb_encoding *enc)
{
VALUE str;

if (!enc) return rb_str_new(ptr, len);

str = str_new0(rb_cString, ptr, len, rb_enc_mbminlen(enc));
rb_enc_associate(str, enc);
return str;
return str_enc_new(rb_cString, ptr, len, enc);
}

VALUE
Expand All @@ -1076,17 +1076,13 @@ rb_str_new_cstr(const char *ptr)
VALUE
rb_usascii_str_new_cstr(const char *ptr)
{
VALUE str = rb_str_new_cstr(ptr);
ENCODING_CODERANGE_SET(str, rb_usascii_encindex(), ENC_CODERANGE_7BIT);
return str;
return rb_enc_str_new_cstr(ptr, rb_usascii_encoding());
}

VALUE
rb_utf8_str_new_cstr(const char *ptr)
{
VALUE str = rb_str_new_cstr(ptr);
rb_enc_associate_index(str, rb_utf8_encindex());
return str;
return rb_enc_str_new_cstr(ptr, rb_utf8_encoding());
}

VALUE
Expand All @@ -1109,8 +1105,7 @@ str_new_static(VALUE klass, const char *ptr, long len, int encindex)
}

if (!ptr) {
rb_encoding *enc = rb_enc_get_from_index(encindex);
str = str_new0(klass, ptr, len, rb_enc_mbminlen(enc));
str = str_enc_new(klass, ptr, len, rb_enc_from_index(encindex));
}
else {
RUBY_DTRACE_CREATE_HOOK(STRING, len);
Expand All @@ -1119,8 +1114,8 @@ str_new_static(VALUE klass, const char *ptr, long len, int encindex)
RSTRING(str)->as.heap.ptr = (char *)ptr;
RSTRING(str)->as.heap.aux.capa = len;
RBASIC(str)->flags |= STR_NOFREE;
rb_enc_associate_index(str, encindex);
}
rb_enc_associate_index(str, encindex);
return str;
}

Expand Down Expand Up @@ -1570,10 +1565,11 @@ str_new_frozen_buffer(VALUE klass, VALUE orig, int copy_encoding)
VALUE str;

long len = RSTRING_LEN(orig);
rb_encoding *enc = copy_encoding ? STR_ENC_GET(orig) : rb_ascii8bit_encoding();
int termlen = copy_encoding ? TERM_LEN(orig) : 1;

if (STR_EMBED_P(orig) || STR_EMBEDDABLE_P(len, termlen)) {
str = str_new0(klass, RSTRING_PTR(orig), len, termlen);
str = str_enc_new(klass, RSTRING_PTR(orig), len, enc);
RUBY_ASSERT(STR_EMBED_P(str));
}
else {
Expand Down Expand Up @@ -1621,7 +1617,7 @@ str_new_frozen_buffer(VALUE klass, VALUE orig, int copy_encoding)
VALUE
rb_str_new_with_class(VALUE obj, const char *ptr, long len)
{
return str_new0(rb_obj_class(obj), ptr, len, TERM_LEN(obj));
return str_enc_new(rb_obj_class(obj), ptr, len, STR_ENC_GET(obj));
}

static VALUE
Expand Down Expand Up @@ -2083,8 +2079,6 @@ rb_str_s_new(int argc, VALUE *argv, VALUE klass)
encoding = kwargs[0];
capacity = kwargs[1];

int termlen = 1;

if (n == 1) {
orig = StringValue(orig);
}
Expand All @@ -2100,7 +2094,6 @@ rb_str_s_new(int argc, VALUE *argv, VALUE klass)

if (!UNDEF_P(encoding)) {
enc = rb_to_encoding(encoding);
termlen = rb_enc_mbminlen(enc);
}

// If capacity is nil, we're basically just duping `orig`.
Expand Down Expand Up @@ -2131,13 +2124,9 @@ rb_str_s_new(int argc, VALUE *argv, VALUE klass)
}
}

VALUE str = str_new0(klass, NULL, capa, termlen);
VALUE str = str_enc_new(klass, NULL, capa, enc);
STR_SET_LEN(str, 0);
TERM_FILL(RSTRING_PTR(str), termlen);

if (enc) {
rb_enc_associate(str, enc);
}
TERM_FILL(RSTRING_PTR(str), enc ? rb_enc_mbmaxlen(enc) : 1);

if (!NIL_P(orig)) {
rb_str_buf_append(str, orig);
Expand Down Expand Up @@ -2426,7 +2415,7 @@ rb_str_plus(VALUE str1, VALUE str2)
if (len1 > LONG_MAX - len2) {
rb_raise(rb_eArgError, "string size too big");
}
str3 = str_new0(rb_cString, 0, len1+len2, termlen);
str3 = str_enc_new(rb_cString, 0, len1+len2, enc);
ptr3 = RSTRING_PTR(str3);
memcpy(ptr3, ptr1, len1);
memcpy(ptr3+len1, ptr2, len2);
Expand Down Expand Up @@ -2521,7 +2510,7 @@ rb_str_times(VALUE str, VALUE times)

len *= RSTRING_LEN(str);
termlen = TERM_LEN(str);
str2 = str_new0(rb_cString, 0, len, termlen);
str2 = str_enc_new(rb_cString, 0, len, STR_ENC_GET(str));
ptr2 = RSTRING_PTR(str2);
if (len) {
n = RSTRING_LEN(str);
Expand Down Expand Up @@ -10887,7 +10876,7 @@ rb_str_justify(int argc, VALUE *argv, VALUE str, char jflag)
rb_raise(rb_eArgError, "argument too big");
}
len += size;
res = str_new0(rb_cString, 0, len, termlen);
res = str_enc_new(rb_cString, 0, len, enc);
p = RSTRING_PTR(res);
if (flen <= 1) {
memset(p, *f, llen);
Expand Down Expand Up @@ -10923,7 +10912,7 @@ rb_str_justify(int argc, VALUE *argv, VALUE str, char jflag)
}
TERM_FILL(p, termlen);
STR_SET_LEN(res, p-RSTRING_PTR(res));
rb_enc_associate(res, enc);

if (argc == 2)
cr = ENC_CODERANGE_AND(cr, ENC_CODERANGE(pad));
if (cr != ENC_CODERANGE_BROKEN)
Expand Down