merge revision(s) 65954,65955,65958: [Backport #15337]

Don't use single byte optimization on grapheme clusters

Unicode Text Segmentation considers CRLF as a character.
[Bug #15337]

add tests using Unicode test data for grapheme clusters

Add file test/ruby/enc/test_grapheme_breaks.rb to test String#each_grapheme_cluster
and \X extended grapheme cluster matcher in regular expressions
against test data provided by Unicode (ucd/auxiliary/GraphemeBreakTest.txt).

Some lines in the data file are ignored, as follows:
- Lines with a surrogate, because Ruby doesn't handle these
- The case of "\r\n", because there is a bug (#15337) in the implementation

remove guard against bug #15337, because it is fixed

git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/branches/ruby_2_5@66073 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
ruby · Nov 28, 2018 · 29eae8b · 29eae8b
1 parent b1944e4
commit 29eae8b
Show file tree

Hide file tree

Showing 4 changed files with 98 additions and 5 deletions.
diff --git a/string.c b/string.c
@@ -8342,7 +8342,7 @@ rb_str_each_grapheme_cluster_size(VALUE str, VALUE args, VALUE eobj)
     rb_encoding *enc = rb_enc_from_index(ENCODING_GET(str));
     const char *ptr, *end;
 
-    if (!rb_enc_unicode_p(enc) || single_byte_optimizable(str)) {
+    if (!rb_enc_unicode_p(enc)) {
 	return rb_str_length(str);
     }
 
@@ -8370,7 +8370,7 @@ rb_str_enumerate_grapheme_clusters(VALUE str, VALUE ary)
     rb_encoding *enc = rb_enc_from_index(ENCODING_GET(str));
     const char *ptr, *end;
 
-    if (!rb_enc_unicode_p(enc) || single_byte_optimizable(str)) {
+    if (!rb_enc_unicode_p(enc)) {
 	return rb_str_enumerate_chars(str, ary);
     }
 

diff --git a/test/ruby/enc/test_grapheme_breaks.rb b/test/ruby/enc/test_grapheme_breaks.rb
@@ -0,0 +1,92 @@
+# frozen_string_literal: true
+# Copyright © 2018 Martin J. Dürst (duerst@it.aoyama.ac.jp)
+
+require "test/unit"
+
+# One test case built from a single data line of the grapheme break test file.
+#
+# Readers:
+#   clusters    - Array of Strings, one per expected grapheme cluster
+#   string      - all clusters joined into one String
+#   comment     - the comment text handed to the constructor
+#   line_number - the line number handed to the constructor
+class BreakTest
+  attr_reader :clusters, :string, :comment, :line_number
+
+  # Parses +data+ into the expected clusters.
+  #
+  # Leading and trailing "÷" markers are stripped, the remainder is split on
+  # "÷" into clusters, and each cluster is split on "×" into hexadecimal code
+  # points that are converted to UTF-8 characters and joined.
+  #
+  # Raises ArgumentError (without a message) when a code point lies in the
+  # surrogate range 0xD800..0xDFFF.
+  #
+  # NOTE(review): String#to_i(16) yields 0 for text that is not hexadecimal,
+  # so a malformed field would silently become U+0000 -- confirm the data
+  # file never contains such fields.
+  def initialize (line_number, data, comment)
+    @line_number = line_number
+    @comment = comment
+    @clusters = data.sub(/\A\s*÷\s*/, '')
+                    .sub(/\s*÷\s*\z/, '')
+                    .split(/\s*÷\s*/)
+                    .map do |cl|
+                      cl.split(/\s*×\s*/)
+                        .map do |ch|
+                          c = ch.to_i(16)
+                           # eliminate cases with surrogates
+                          raise ArgumentError if 0xD800 <= c and c <= 0xDFFF
+                          c.chr('UTF-8')
+                        end.join
+                    end
+    @string = @clusters.join
+  end
+end
+
+# Locates the GraphemeBreakTest.txt data file for the Unicode version reported
+# by RbConfig and reports, through a skipped test, when the file is missing.
+class TestGraphemeBreaksFromFile < Test::Unit::TestCase
+  UNICODE_VERSION = RbConfig::CONFIG['UNICODE_VERSION']
+  # Version-specific data directory, resolved relative to this file's directory.
+  path = File.expand_path("../../../enc/unicode/data/#{UNICODE_VERSION}", __dir__)
+  # Prefer the ucd/auxiliary subdirectory when it exists; otherwise use the
+  # version directory itself.
+  UNICODE_DATA_PATH = File.directory?("#{path}/ucd/auxiliary") ? "#{path}/ucd/auxiliary" : path
+  GRAPHEME_BREAK_TEST_FILE = File.expand_path("#{UNICODE_DATA_PATH}/GraphemeBreakTest.txt", __dir__)
+
+  # Returns true when the grapheme break test data file exists on disk.
+  def self.file_available?
+    File.exist? GRAPHEME_BREAK_TEST_FILE
+  end
+
+  # Skips (with an explanatory message) when the data file is not available;
+  # makes no assertion otherwise.
+  def test_data_files_available
+    unless TestGraphemeBreaksFromFile.file_available?
+      skip "Unicode data file GraphemeBreakTest not available in #{UNICODE_DATA_PATH}."
+    end
+  end
+end
+
+# Reopen the test class and define the data-driven tests only when the data
+# file exists: `and` short-circuits, so the class body below is not evaluated
+# when file_available? returns false.
+TestGraphemeBreaksFromFile.file_available? and  class TestGraphemeBreaksFromFile
+  # Reads the data file and returns an Array of BreakTest objects.
+  #
+  # Raises RuntimeError ("File Version Mismatch") when the first line does not
+  # start with the header naming UNICODE_VERSION. Lines starting with "#" are
+  # skipped.
+  def read_data
+    tests = []
+    IO.foreach(GRAPHEME_BREAK_TEST_FILE, encoding: Encoding::UTF_8) do |line|
+      # $. is the number of the line just read.
+      if $. == 1 and not line.start_with?("# GraphemeBreakTest-#{UNICODE_VERSION}.txt")
+        raise "File Version Mismatch"
+      end
+      next if /\A#/.match? line
+      # The line is split on '#' and splatted into BreakTest.new, which takes
+      # exactly (line_number, data, comment). The rescue modifier discards any
+      # StandardError raised here, so lines for which BreakTest raises
+      # ArgumentError (surrogates) are dropped.
+      # NOTE(review): the same rescue also silently drops lines that do not
+      # split into exactly two parts (wrong argument count) -- confirm that
+      # is intended.
+      tests << BreakTest.new($., *line.chomp.split('#')) rescue 'whatever'
+    end
+    tests
+  end
+
+  # Returns the parsed tests, memoized in the class variable @@tests.
+  # Falls back to an empty Array when reading raises Errno::ENOENT.
+  def all_tests
+    @@tests ||= read_data
+  rescue Errno::ENOENT
+    @@tests ||= []
+  end
+
+  # Checks that String#each_grapheme_cluster splits each test string into
+  # exactly the expected clusters.
+  def test_each_grapheme_cluster
+    all_tests.each do |test|
+      expected = test.clusters
+      actual = test.string.each_grapheme_cluster.to_a
+      assert_equal expected, actual,
+        "line #{test.line_number}, expected '#{expected}', " +
+        "but got '#{actual}', comment: #{test.comment}"
+    end
+  end
+
+  # Checks the \X regexp matcher: repeatedly removes one leading \X match from
+  # a copy of the test string and verifies that what remains equals the
+  # remaining expected clusters joined together.
+  def test_backslash_X
+    all_tests.each do |test|
+      # Work on copies; sub! and shift below mutate them.
+      clusters = test.clusters.dup
+      string = test.string.dup
+      removals = 0
+      while string.sub!(/\A\X/, '')
+        removals += 1
+        clusters.shift
+        expected = clusters.join
+        assert_equal expected, string,
+          "line #{test.line_number}, removals: #{removals}, expected '#{expected}', " +
+          "but got '#{string}', comment: #{test.comment}"
+      end
+      # NOTE(review): `expected` is only assigned inside the loop, so it is nil
+      # here if the loop body never ran (e.g. an empty test string) -- confirm
+      # that cannot happen with the data file.
+      assert_equal expected, string,
+        "line #{test.line_number}, after last removal, expected '#{expected}', " +
+        "but got '#{string}', comment: #{test.comment}"
+    end
+  end
+end
diff --git a/test/ruby/test_string.rb b/test/ruby/test_string.rb
@@ -968,6 +968,7 @@ def test_chars
 
   def test_each_grapheme_cluster
     [
+      "\u{0D 0A}",
       "\u{20 200d}",
       "\u{600 600}",
       "\u{600 20}",

diff --git a/version.h b/version.h
@@ -1,10 +1,10 @@
 #define RUBY_VERSION "2.5.4"
-#define RUBY_RELEASE_DATE "2018-11-15"
-#define RUBY_PATCHLEVEL 112
+#define RUBY_RELEASE_DATE "2018-11-28"
+#define RUBY_PATCHLEVEL 113
 
 #define RUBY_RELEASE_YEAR 2018
 #define RUBY_RELEASE_MONTH 11
-#define RUBY_RELEASE_DAY 15
+#define RUBY_RELEASE_DAY 28
 
 #include "ruby/version.h"