Calculate mbchar width with bsearch (#632)

tompng · web-flow · commit 0851e9364040 · 2024-08-30T02:34:24.000+09:00
diff --git a/bin/generate_east_asian_width b/bin/generate_east_asian_width
@@ -5,6 +5,18 @@ if ARGV.empty?
   exit 1
 end
 
+def unicode_width(type, category) # type/category: the two fields captured from one EastAsianWidth.txt line
+  return 0 if category == 'Mn' # Nonspacing Mark: always width 0, checked before the type is considered
+  case type
+  when 'F', 'W' # Fullwidth, Wide
+    2
+  when 'H', 'Na', 'N' # Halfwidth, Narrow, Neutral
+    1
+  when 'A' # Ambiguous
+    -1 # sentinel: get_mbchar_width (lib/reline/unicode.rb) returns Reline.ambiguous_width for it
+  end # no else branch: any other type yields nil
+end
+
 open(ARGV.first, 'rt') do |f|
   if m = f.gets.match(/^# EastAsianWidth-(\d+\.\d+\.\d+)\.txt/)
     unicode_version = m[1]
@@ -13,66 +25,31 @@ open(ARGV.first, 'rt') do |f|
     unicode_version = nil
   end
 
-  list = []
+  widths = [] # widths[codepoint] = width; codepoints never listed in the file stay nil
   f.each_line do |line|
-    next unless m = line.match(/^(\h+)(?:\.\.(\h+))?\s*;\s*(\w+)\s+#.+/)
+    next unless /^(?<first>\h+)(?:\.\.(?<last>\h+))?\s*;\s*(?<type>\w+)\s+# +(?<category>[^ ]+)/ =~ line
 
-    first = m[1].to_i(16)
-    last = m[2]&.to_i(16) || first
-    type = m[3].to_sym
-    if !list.empty? and (list.last[:range].last + 1) == first and list.last[:type] == type
-      list.last[:range] = (list.last[:range].first..last)
-    else
-      # [\u{D800}-\u{DFFF}] cause error.
-      unless ((0xD800..0xDFFF).to_a & (first..last).to_a).empty?
-        unless (first..0xD7FF).to_a.empty?
-          list << {
-            range: (first..0xD7FF),
-            type: type.to_sym
-          }
-        end
-        unless (0xE000..last).to_a.empty?
-          list << {
-            range: (first..0xD7FF),
-            type: type.to_sym
-          }
-        end
-      else
-        list << {
-          range: (first..last),
-          type: type.to_sym
-        }
-      end
-    end
+    range = first.to_i(16)..(last || first).to_i(16) # a one-codepoint range when the line has no ".." part
+    widths.fill(unicode_width(type, category), range) # store the same width for every codepoint in the range
   end
-  grouped = list.group_by { |item| item[:type] }.map { |item| [item.first, item.last.map { |row| row[:range] }] }.to_h
-  grouped = %i{F H W Na A N}.map { |type| [type, grouped[type]] }
-  puts <<EOH
-class Reline::Unicode::EastAsianWidth
-  # This is based on EastAsianWidth.txt
-  # UNICODE_VERSION = #{unicode_version ? "'#{unicode_version}'" : 'nil'}
 
-EOH
-  puts grouped.map { |item|
-    type, ranges = item
-    output =  "  # %s\n" %
-      case type
-      when :F  then 'Fullwidth'
-      when :H  then 'Halfwidth'
-      when :W  then 'Wide'
-      when :Na then 'Narrow'
-      when :A  then 'Ambiguous'
-      when :N  then 'Neutral'
-      end
-    output += "  TYPE_%s = /^[\#{ %%W(\n" % type.upcase
-    output += ranges.map { |range|
-      if range.first == range.last
-        '    \u{%04X}' % range.first
-      else
-        '    \u{%04X}-\u{%04X}' % [range.first, range.last]
-      end
-    }.join("\n")
-    output += "\n  ).join }]/\n"
-  }.join("\n")
-  puts 'end'
+  # EscapedPairs: 0x00-0x1F and 0x7F are forced to width 2, overriding whatever the data file assigned
+  [*0x00..0x1F, 0x7F].each { |ord| widths[ord] = 2 }
+  # printable ASCII chars are forced to width 1
+  (0x20..0x7E).each { |ord| widths[ord] = 1 }
+
+  chunks = widths.each_with_index.chunk { |width, _idx| width || 1 } # runs of consecutive codepoints with equal width; nil counts as 1
+  chunk_last_ords = chunks.map { |width, chunk| [chunk.last.last, width] } # keep only [last codepoint of the run, width]
+  chunk_last_ords << [0x7fffffff, 1] # final entry: ordinals above the table still find a chunk, with width 1
+
+  puts <<~EOH
+    class Reline::Unicode::EastAsianWidth
+      # This is based on EastAsianWidth.txt
+      # UNICODE_VERSION = #{unicode_version ? "'#{unicode_version}'" : 'nil'}
+
+      CHUNK_LAST, CHUNK_WIDTH = [
+    #{chunk_last_ords.map { |ord, width| "    [0x#{ord.to_s(16)}, #{width}]" }.join(",\n")}
+      ].transpose.map(&:freeze)
+    end
+  EOH
 end
diff --git a/lib/reline/unicode.rb b/lib/reline/unicode.rb
@@ -56,51 +56,26 @@ def self.escape_for_print(str)
 
   require 'reline/unicode/east_asian_width'
 
-  HalfwidthDakutenHandakuten = /[\u{FF9E}\u{FF9F}]/
-
-  MBCharWidthRE = /
-    (?<width_2_1>
-      [#{ EscapedChars.map {|c| "\\x%02x" % c.ord }.join }] (?# ^ + char, such as ^M, ^H, ^[, ...)
-    )
-  | (?<width_3>^\u{2E3B}) (?# THREE-EM DASH)
-  | (?<width_0>^\p{M})
-  | (?<width_2_2>
-      #{ EastAsianWidth::TYPE_F }
-    | #{ EastAsianWidth::TYPE_W }
-    )
-  | (?<width_1>
-      #{ EastAsianWidth::TYPE_H }
-    | #{ EastAsianWidth::TYPE_NA }
-    | #{ EastAsianWidth::TYPE_N }
-    )(?!#{ HalfwidthDakutenHandakuten })
-  | (?<width_2_3>
-      (?: #{ EastAsianWidth::TYPE_H }
-        | #{ EastAsianWidth::TYPE_NA }
-        | #{ EastAsianWidth::TYPE_N })
-      #{ HalfwidthDakutenHandakuten }
-    )
-  | (?<ambiguous_width>
-      #{EastAsianWidth::TYPE_A}
-    )
-  /x
-
   def self.get_mbchar_width(mbchar)
     ord = mbchar.ord
-    if (0x00 <= ord and ord <= 0x1F) # in EscapedPairs
+    if ord <= 0x1F # in EscapedPairs (String#ord is never negative, so no lower bound is needed)
       return 2
-    elsif (0x20 <= ord and ord <= 0x7E) # printable ASCII chars
+    elsif ord <= 0x7E # printable ASCII chars (0x20..0x7E; lower ordinals already returned above)
       return 1
     end
-    m = mbchar.encode(Encoding::UTF_8).match(MBCharWidthRE)
-    case
-    when m.nil? then 1 # TODO should be U+FFFD � REPLACEMENT CHARACTER
-    when m[:width_2_1], m[:width_2_2], m[:width_2_3] then 2
-    when m[:width_3] then 3
-    when m[:width_0] then 0
-    when m[:width_1] then 1
-    when m[:ambiguous_width] then Reline.ambiguous_width
+    utf8_mbchar = mbchar.encode(Encoding::UTF_8)
+    ord = utf8_mbchar.ord # re-read from the UTF-8 string: the table is keyed by Unicode codepoint
+    chunk_index = EastAsianWidth::CHUNK_LAST.bsearch_index { |o| ord <= o } # first chunk whose last codepoint is >= ord
+    size = EastAsianWidth::CHUNK_WIDTH[chunk_index]
+    if size == -1 # -1 is the generator's marker for East Asian Ambiguous
+      Reline.ambiguous_width
+    elsif size == 1 && utf8_mbchar.size >= 2 # width-1 first character followed by at least one more character
+      second_char_ord = utf8_mbchar[1].ord
+      # Halfwidth Dakuten (U+FF9E) and Halfwidth Handakuten (U+FF9F)
+      # Only these two characters have the Letter Modifier category and can be combined in a single grapheme cluster
+      (second_char_ord == 0xFF9E || second_char_ord == 0xFF9F) ? 2 : 1
     else
-      nil
+      size
     end
   end
 
diff --git a/lib/reline/unicode/east_asian_width.rb b/lib/reline/unicode/east_asian_width.rb