Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
merge revision(s) 65954,65955,65958: [Backport #15337]
Don't use single byte optimization on grapheme clusters: Unicode Text Segmentation considers CRLF as a character. [Bug #15337] Add tests using Unicode test data for grapheme clusters: add file test/ruby/enc/test_grapheme_breaks.rb to test String#each_grapheme_cluster and the \X extended grapheme cluster matcher in regular expressions against test data provided by Unicode (ucd/auxiliary/GraphemeBreakTest.txt). Some lines in the data file are ignored, as follows: - Lines with a surrogate, because Ruby doesn't handle these. - The case of "\r\n", because there is a bug (#15337) in the implementation. Remove the guard against bug #15337, because it is fixed. git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/branches/ruby_2_5@66073 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
- Loading branch information
Showing
4 changed files
with
98 additions
and
5 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,92 @@ | ||
# frozen_string_literal: true | ||
# Copyright © 2018 Martin J. Dürst (duerst@it.aoyama.ac.jp) | ||
|
||
require "test/unit" | ||
|
||
# One test case parsed from a data line of Unicode's GraphemeBreakTest.txt.
#
# In the data field, "÷" marks a grapheme cluster boundary and "×" joins the
# code points that belong to the same cluster. Code points are written in
# hexadecimal.
class BreakTest
  attr_reader :clusters, :string, :comment, :line_number

  # line_number:: position of the line in the data file (kept for messages)
  # data::        the field before the "#", e.g. "÷ 0020 × 0308 ÷ 0020 ÷"
  # comment::     the text after the "#", stored unchanged
  #
  # Raises ArgumentError when a code point is a surrogate or is not valid
  # hexadecimal. Integer#chr raises RangeError for values it cannot encode
  # in UTF-8.
  def initialize(line_number, data, comment)
    @line_number = line_number
    @comment = comment
    # Drop the leading and trailing boundary markers, then cut the rest at
    # the inner boundaries: one piece per expected grapheme cluster.
    @clusters = data.sub(/\A\s*÷\s*/, '')
                    .sub(/\s*÷\s*\z/, '')
                    .split(/\s*÷\s*/)
                    .map { |cluster| decode_cluster(cluster) }
    @string = @clusters.join
  end

  private

  # Turns "0020 × 0308" into the UTF-8 string made of those code points.
  def decode_cluster(cluster)
    cluster.split(/\s*×\s*/).map do |hex|
      # Integer() is strict: String#to_i would silently read a malformed
      # field as 0 (or as its valid prefix) and yield a wrong character.
      code_point = Integer(hex, 16)
      # eliminate cases with surrogates
      if code_point.between?(0xD800, 0xDFFF)
        raise ArgumentError, "surrogate code point U+#{hex} cannot be tested"
      end
      code_point.chr('UTF-8')
    end.join
  end
end
|
||
# Locates Unicode's GraphemeBreakTest.txt for the Unicode version reported by
# RbConfig and reports a skip when the file is not present.
class TestGraphemeBreaksFromFile < Test::Unit::TestCase
  UNICODE_VERSION = RbConfig::CONFIG['UNICODE_VERSION']

  # The data file is looked up in the "ucd/auxiliary" subdirectory when that
  # directory exists, and directly in the version directory otherwise.
  data_root = File.expand_path("../../../enc/unicode/data/#{UNICODE_VERSION}", __dir__)
  auxiliary_dir = "#{data_root}/ucd/auxiliary"
  UNICODE_DATA_PATH =
    if File.directory?(auxiliary_dir)
      auxiliary_dir
    else
      data_root
    end
  GRAPHEME_BREAK_TEST_FILE = File.expand_path("#{UNICODE_DATA_PATH}/GraphemeBreakTest.txt", __dir__)

  class << self
    # True when the grapheme break test data file exists on disk.
    def file_available?
      File.exist?(GRAPHEME_BREAK_TEST_FILE)
    end
  end

  # Makes a missing data file visible in the test report as a skip.
  def test_data_files_available
    return if TestGraphemeBreaksFromFile.file_available?

    skip "Unicode data file GraphemeBreakTest not available in #{UNICODE_DATA_PATH}."
  end
end
|
||
# Data-driven tests. They are only defined when the Unicode data file is
# present, so a checkout without the file does not produce failures here.
if TestGraphemeBreaksFromFile.file_available?
  class TestGraphemeBreaksFromFile
    # Parses GRAPHEME_BREAK_TEST_FILE into an array of BreakTest objects.
    #
    # Raises RuntimeError when the first line does not name the expected
    # Unicode version. Comment lines, lines without both a data field and a
    # comment field, and lines BreakTest rejects are left out.
    def read_data
      tests = []
      lines = File.foreach(GRAPHEME_BREAK_TEST_FILE, encoding: Encoding::UTF_8)
      lines.with_index(1) do |line, line_number|
        if line_number == 1 && !line.start_with?("# GraphemeBreakTest-#{UNICODE_VERSION}.txt")
          raise "File Version Mismatch"
        end
        next if line.start_with?('#')

        # Limit the split to two fields so that a "#" inside the comment
        # does not change the number of fields.
        data, comment = line.chomp.split('#', 2)
        next unless data && comment

        begin
          tests << BreakTest.new(line_number, data, comment)
        rescue ArgumentError, RangeError
          # BreakTest raises ArgumentError for surrogates; RangeError comes
          # from code points that cannot be encoded. Only these are skipped,
          # so that unrelated errors are not hidden.
          next
        end
      end
      tests
    end

    # Returns the parsed tests, reading the file only on the first call.
    # A file that disappeared after the availability check yields no tests.
    def all_tests
      @@tests ||= read_data
    rescue Errno::ENOENT
      @@tests ||= []
    end

    # String#each_grapheme_cluster must split each test string into exactly
    # the clusters listed in the data file.
    def test_each_grapheme_cluster
      all_tests.each do |test|
        expected = test.clusters
        actual = test.string.each_grapheme_cluster.to_a
        assert_equal expected, actual,
                     "line #{test.line_number}, expected '#{expected}', " \
                     "but got '#{actual}', comment: #{test.comment}"
      end
    end

    # Removing one leading \X match at a time must consume the test string
    # cluster by cluster.
    def test_backslash_X
      all_tests.each do |test|
        clusters = test.clusters.dup
        string = test.string.dup
        # Set before the loop so the final assertion has a value to compare
        # against even when no removal takes place.
        expected = clusters.join
        removals = 0
        while string.sub!(/\A\X/, '')
          removals += 1
          clusters.shift
          expected = clusters.join
          assert_equal expected, string,
                       "line #{test.line_number}, removals: #{removals}, expected '#{expected}', " \
                       "but got '#{string}', comment: #{test.comment}"
        end
        assert_equal expected, string,
                     "line #{test.line_number}, after last removal, expected '#{expected}', " \
                     "but got '#{string}', comment: #{test.comment}"
      end
    end
  end
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters