Skip to content

Commit

Permalink
Add proper unicode support to Regexp
Browse files Browse the repository at this point in the history
This includes the ability to recompile a Regexp if $KCODE changes.
  • Loading branch information
Evan Phoenix committed Feb 18, 2010
1 parent db94b58 commit 72341f2
Show file tree
Hide file tree
Showing 3 changed files with 79 additions and 7 deletions.
82 changes: 76 additions & 6 deletions vm/builtin/regexp.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,20 @@ namespace rubinius {
return r;
}

// Translates the value reported by state->shared.kcode_page() into the
// Oniguruma encoding used to compile patterns. Any page without a dedicated
// encoding below falls back to plain ASCII.
static OnigEncoding current_encoding(STATE) {
  OnigEncoding selected = ONIG_ENCODING_ASCII;

  switch(state->shared.kcode_page()) {
  case kcode::eUTF8:
    selected = ONIG_ENCODING_UTF8;
    break;
  case kcode::eSJIS:
    selected = ONIG_ENCODING_SJIS;
    break;
  case kcode::eEUC:
    selected = ONIG_ENCODING_EUC_JP;
    break;
  case kcode::eAscii:
  default:
    // ASCII was preselected above.
    break;
  }

  return selected;
}

int get_kcode_from_enc(OnigEncoding enc) {
int r;

Expand Down Expand Up @@ -101,6 +115,7 @@ namespace rubinius {
Regexp* o_reg = state->new_object<Regexp>(G(regexp));

o_reg->onig_data = NULL;
o_reg->forced_encoding_ = false;

return o_reg;
}
Expand Down Expand Up @@ -173,6 +188,48 @@ namespace rubinius {
onig_free(old_reg);
}

/*
 * Recompiles the pattern when the encoding reported by current_encoding()
 * differs from the one the existing regex_t was compiled with.
 *
 * Does nothing when the encoding was fixed by the user (forced_encoding_),
 * when nothing has been compiled yet, or when the encodings already agree.
 *
 * The new regex is built into a temporary and only installed on success.
 * If the pattern cannot be compiled under the new encoding, the existing
 * compiled pattern is left untouched and the encoding is marked as forced
 * so the recompile is not attempted again.
 */
void Regexp::maybe_recompile(STATE) {
  // An explicitly requested encoding is never second-guessed.
  if(forced_encoding_) return;

  // create() leaves onig_data NULL until initialize() runs; there is
  // nothing to recompile (or to dereference) in that state.
  if(!onig_data) return;

  OnigEncoding enc = current_encoding(state);
  if(enc == onig_data->enc) return;

  const UChar* pat = (UChar*)source()->c_str();
  const UChar* end = pat + source()->size();

  // Compile into a scratch pointer. Handing &this->onig_data to onig_new
  // directly would throw away the only reference to the working regex if
  // the compile fails, forcing a second compile just to get it back.
  regex_t* recompiled = NULL;
  OnigErrorInfo err_info;

  int err = onig_new(&recompiled, pat, end, onig_data->options,
                     enc, ONIG_SYNTAX_RUBY, &err_info);

  if(err != ONIG_NORMAL) {
    // The pattern is not valid in the new encoding. Keep using the regex
    // compiled under the original encoding and stop retrying.
    forced_encoding_ = true;
    return;
  }

  onig_data = recompiled;
  make_managed(state);
}

/*
* This is a primitive so #initialize_copy can work.
*/
Expand All @@ -190,9 +247,17 @@ namespace rubinius {

opts = options->to_native();
kcode = opts & KCODE_MASK;
enc = get_enc_from_kcode(kcode);
opts &= OPTION_MASK;

if(kcode == 0) {
enc = current_encoding(state);
} else {
// Don't attempt to fix the encoding later, it's been specified by the
// user.
enc = get_enc_from_kcode(kcode);
forced_encoding_ = true;
}

err = onig_new(&this->onig_data, pat, end, opts, enc, ONIG_SYNTAX_RUBY, &err_info);

if(err != ONIG_NORMAL) {
Expand Down Expand Up @@ -234,15 +299,17 @@ namespace rubinius {
}

/*
 * Returns the option bits of the compiled regex (masked with OPTION_MASK)
 * as an Integer. The kcode bits for the regex's encoding are included only
 * when the encoding was explicitly forced; a regex that merely follows the
 * current encoding reports no kcode bits.
 */
Object* Regexp::options(STATE) {
  regex_t* reg = onig_data;

  int result = ((int)onig_get_options(reg) & OPTION_MASK);

  if(forced_encoding_) {
    result |= get_kcode_from_enc(onig_get_encoding(reg));
  }

  return Integer::from(state, result);
}

static Tuple* _md_region_to_tuple(STATE, OnigRegion *region, int max) {
Expand Down Expand Up @@ -278,6 +345,8 @@ namespace rubinius {
OnigRegion *region;
Object* md;

maybe_recompile(state);

region = onig_region_new();

max = string->size();
Expand Down Expand Up @@ -329,6 +398,7 @@ namespace rubinius {
OnigRegion *region;
Object* md = Qnil;

maybe_recompile(state);
region = onig_region_new();

max = string->size();
Expand Down
2 changes: 2 additions & 0 deletions vm/builtin/regexp.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ namespace rubinius {
String* source_; // slot
LookupTable* names_; // slot
regex_t* onig_data;
bool forced_encoding_;

public:
/* accessors */
Expand All @@ -41,6 +42,7 @@ namespace rubinius {

// Ruby.primitive :regexp_initialize
Regexp* initialize(STATE, String* pattern, Integer* options, Object* lang);
void maybe_recompile(STATE);

// Ruby.primitive :regexp_options
Object* options(STATE);
Expand Down
2 changes: 1 addition & 1 deletion vm/test/test_regexp.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ class TestRegexp : public CxxTest::TestSuite, public VMTest {
Regexp* re = Regexp::create(state);
re->initialize(state, pat, Fixnum::from(0), Qnil);

TS_ASSERT_EQUALS(as<Integer>(re->options(state))->to_native(), 16);
TS_ASSERT_EQUALS(as<Integer>(re->options(state))->to_native(), 0);
}

void test_match_region() {
Expand Down

0 comments on commit 72341f2

Please sign in to comment.