Skip to content

Commit

Permalink
merge revision(s) 65954,65955,65958: [Backport #15337]
Browse files Browse the repository at this point in the history
	Don't use single byte optimization on grapheme clusters

	Unicode Text Segmentation treats CRLF as a single grapheme cluster. [Bug #15337]

	add tests using Unicode test data for grapheme clusters

	Add file test/ruby/enc/test_grapheme_breaks.rb to test String#each_grapheme_cluster
	and \X extended grapheme cluster matcher in regular expressions against test data
	provided by Unicode (ucd/auxiliary/GraphemeBreakTest.txt).

	Some lines in the data file are ignored, as follows:
	- Lines with a surrogate, because Ruby doesn't handle these
	- The case of "\r\n", because there is a bug (#15337) in the implementation

	remove guard against bug #15337, because it is fixed

git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/branches/ruby_2_5@66073 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
  • Loading branch information
nagachika committed Nov 28, 2018
1 parent b1944e4 commit 29eae8b
Show file tree
Hide file tree
Showing 4 changed files with 98 additions and 5 deletions.
4 changes: 2 additions & 2 deletions string.c
Expand Up @@ -8342,7 +8342,7 @@ rb_str_each_grapheme_cluster_size(VALUE str, VALUE args, VALUE eobj)
rb_encoding *enc = rb_enc_from_index(ENCODING_GET(str));
const char *ptr, *end;

if (!rb_enc_unicode_p(enc) || single_byte_optimizable(str)) {
if (!rb_enc_unicode_p(enc)) {
return rb_str_length(str);
}

Expand Down Expand Up @@ -8370,7 +8370,7 @@ rb_str_enumerate_grapheme_clusters(VALUE str, VALUE ary)
rb_encoding *enc = rb_enc_from_index(ENCODING_GET(str));
const char *ptr, *end;

if (!rb_enc_unicode_p(enc) || single_byte_optimizable(str)) {
if (!rb_enc_unicode_p(enc)) {
return rb_str_enumerate_chars(str, ary);
}

Expand Down
92 changes: 92 additions & 0 deletions test/ruby/enc/test_grapheme_breaks.rb
@@ -0,0 +1,92 @@
# frozen_string_literal: true
# Copyright © 2018 Martin J. Dürst (duerst@it.aoyama.ac.jp)

require "test/unit"

# One parsed test case from Unicode's GraphemeBreakTest.txt.
#
# The data part of a line is a sequence of hexadecimal code points separated
# by U+00F7 DIVISION SIGN (a grapheme cluster boundary is expected here) or
# U+00D7 MULTIPLICATION SIGN (no boundary is expected here), and it starts
# and ends with U+00F7.
#
# Readers:
#   clusters    - Array of Strings, one per expected grapheme cluster
#   string      - all clusters joined into the string under test
#   comment     - the comment text passed to the constructor
#   line_number - the line number passed to the constructor
class BreakTest
  attr_reader :clusters, :string, :comment, :line_number

  # line_number - position of the line in the data file (used in messages)
  # data        - the data part of the line (everything before '#')
  # comment     - the comment part of the line (everything after '#')
  #
  # Raises ArgumentError if a code point is a surrogate or is not valid
  # hexadecimal.
  #
  # The separators are written as \u escapes (U+00F7 and U+00D7) so that the
  # patterns do not depend on the source file keeping its non-ASCII characters.
  def initialize(line_number, data, comment)
    @line_number = line_number
    @comment = comment
    @clusters = data.sub(/\A\s*\u00F7\s*/, '')  # leading boundary marker
                    .sub(/\s*\u00F7\s*\z/, '')  # trailing boundary marker
                    .split(/\s*\u00F7\s*/)      # one piece per expected cluster
                    .map { |cluster| decode_cluster(cluster) }
    @string = @clusters.join
  end

  private

  # Turns one cluster description ("0020 × 0308") into the UTF-8 string it denotes.
  def decode_cluster(cluster)
    cluster.split(/\s*\u00D7\s*/).map do |hex|
      # Integer() rejects malformed tokens instead of silently yielding 0.
      code_point = Integer(hex, 16)
      # eliminate cases with surrogates
      if code_point >= 0xD800 && code_point <= 0xDFFF
        raise ArgumentError, format('surrogate code point U+%04X', code_point)
      end
      code_point.chr('UTF-8')
    end.join
  end
end

# Locates Unicode's GraphemeBreakTest.txt for the Unicode version this Ruby
# was configured with, and reports (as a skipped test) when it is missing.
class TestGraphemeBreaksFromFile < Test::Unit::TestCase
  UNICODE_VERSION = RbConfig::CONFIG['UNICODE_VERSION']

  # The data may sit directly in the per-version directory or in its
  # ucd/auxiliary subdirectory; the subdirectory wins when it exists.
  version_dir = File.expand_path("../../../enc/unicode/data/#{UNICODE_VERSION}", __dir__)
  auxiliary_dir = "#{version_dir}/ucd/auxiliary"
  UNICODE_DATA_PATH = if File.directory?(auxiliary_dir)
                        auxiliary_dir
                      else
                        version_dir
                      end
  GRAPHEME_BREAK_TEST_FILE = File.expand_path("#{UNICODE_DATA_PATH}/GraphemeBreakTest.txt", __dir__)

  # Returns true when the grapheme break test data file exists.
  def self.file_available?
    File.exist?(GRAPHEME_BREAK_TEST_FILE)
  end

  # Skips, with an explanatory message, when the data file is not available.
  def test_data_files_available
    return if TestGraphemeBreaksFromFile.file_available?

    skip "Unicode data file GraphemeBreakTest not available in #{UNICODE_DATA_PATH}."
  end
end

# The data-driven tests are only defined when the Unicode data file exists.
if TestGraphemeBreaksFromFile.file_available?
  class TestGraphemeBreaksFromFile
    # Reads GRAPHEME_BREAK_TEST_FILE and returns an Array of BreakTest objects.
    #
    # Raises RuntimeError ("File Version Mismatch") when the first line does
    # not name the expected UNICODE_VERSION. Lines starting with '#' are
    # skipped, and any line for which building the BreakTest raises a
    # StandardError (for example a line containing a surrogate) is dropped.
    def read_data
      parsed = []
      IO.foreach(GRAPHEME_BREAK_TEST_FILE, encoding: Encoding::UTF_8).with_index(1) do |line, line_number|
        if line_number == 1 && !line.start_with?("# GraphemeBreakTest-#{UNICODE_VERSION}.txt")
          raise "File Version Mismatch"
        end
        next if line.start_with?('#')

        begin
          parsed << BreakTest.new(line_number, *line.chomp.split('#'))
        rescue StandardError
          # Deliberately ignored: this line cannot be turned into a test case.
        end
      end
      parsed
    end

    # Returns the parsed test cases, reading the file only on first use.
    # Falls back to an empty list when the file cannot be found.
    def all_tests
      @@tests ||= read_data
    rescue Errno::ENOENT
      @@tests ||= []
    end

    # String#each_grapheme_cluster must yield exactly the expected clusters.
    def test_each_grapheme_cluster
      all_tests.each do |break_test|
        wanted = break_test.clusters
        got = break_test.string.each_grapheme_cluster.to_a
        message = "line #{break_test.line_number}, expected '#{wanted}', " +
                  "but got '#{got}', comment: #{break_test.comment}"
        assert_equal wanted, got, message
      end
    end

    # Repeatedly removing a leading \X match must strip one expected cluster
    # at a time from the front of the string.
    def test_backslash_X
      all_tests.each do |break_test|
        remaining_clusters = break_test.clusters.dup
        remaining = break_test.string.dup
        wanted = nil # stays nil when no removal happens at all
        count = 0
        until remaining.sub!(/\A\X/, '').nil?
          count += 1
          remaining_clusters.shift
          wanted = remaining_clusters.join
          assert_equal wanted, remaining,
                       "line #{break_test.line_number}, removals: #{count}, expected '#{wanted}', " +
                       "but got '#{remaining}', comment: #{break_test.comment}"
        end
        assert_equal wanted, remaining,
                     "line #{break_test.line_number}, after last removal, expected '#{wanted}', " +
                     "but got '#{remaining}', comment: #{break_test.comment}"
      end
    end
  end
end
1 change: 1 addition & 0 deletions test/ruby/test_string.rb
Expand Up @@ -968,6 +968,7 @@ def test_chars

def test_each_grapheme_cluster
[
"\u{0D 0A}",
"\u{20 200d}",
"\u{600 600}",
"\u{600 20}",
Expand Down
6 changes: 3 additions & 3 deletions version.h
@@ -1,10 +1,10 @@
#define RUBY_VERSION "2.5.4"
#define RUBY_RELEASE_DATE "2018-11-15"
#define RUBY_PATCHLEVEL 112
#define RUBY_RELEASE_DATE "2018-11-28"
#define RUBY_PATCHLEVEL 113

#define RUBY_RELEASE_YEAR 2018
#define RUBY_RELEASE_MONTH 11
#define RUBY_RELEASE_DAY 15
#define RUBY_RELEASE_DAY 28

#include "ruby/version.h"

Expand Down

0 comments on commit 29eae8b

Please sign in to comment.