From 78ef23e47b1acc5f19fb8c1ee9664dadb0ec8d8e Mon Sep 17 00:00:00 2001 From: Eric Hodel Date: Sat, 8 Dec 2012 13:06:12 -0800 Subject: [PATCH] Parse Unicode characters above \uFFFF MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The regular expression matching identifiers did not cover Unicode characters above \uFFFF. Now characters such as 𝖒 (U+1D592) can be parsed in an identifier. Ruby Bug #7524 --- lib/rdoc/ruby_lex.rb | 2 +- test/test_rdoc_ruby_lex.rb | 10 ++++++++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/lib/rdoc/ruby_lex.rb b/lib/rdoc/ruby_lex.rb index 845569b0bc..313e69ea47 100644 --- a/lib/rdoc/ruby_lex.rb +++ b/lib/rdoc/ruby_lex.rb @@ -857,7 +857,7 @@ def identify_gvar end IDENT_RE = if defined? Encoding then - /[\w\u0080-\uFFFF]/u + eval '/[\w\u{0080}-\u{FFFFF}]/u' # 1.8 can't parse \u{} else /[\w\x80-\xFF]/ end diff --git a/test/test_rdoc_ruby_lex.rb b/test/test_rdoc_ruby_lex.rb index 1dc11e95a3..dfa350e018 100644 --- a/test/test_rdoc_ruby_lex.rb +++ b/test/test_rdoc_ruby_lex.rb @@ -1,3 +1,5 @@ +# coding: UTF-8 + require 'rdoc/test_case' class TestRDocRubyLex < RDoc::TestCase @@ -133,6 +135,14 @@ def test_class_tokenize_heredoc_percent_N assert_equal expected, tokens end + def test_class_tokenize_identifier_high_unicode + tokens = RDoc::RubyLex.tokenize '𝖒', nil + + expected = @TK::TkIDENTIFIER.new(0, 1, 0, '𝖒') + + assert_equal expected, tokens.first + end + def test_class_tokenize_percent_1 tokens = RDoc::RubyLex.tokenize 'v%10==10', nil