Skip to content

Commit 0851e93

Browse files
authored
Calculate mbchar width with bsearch (#632)
1 parent 14784ed commit 0851e93

File tree

3 files changed

+1311
-1288
lines changed

3 files changed

+1311
-1288
lines changed

bin/generate_east_asian_width

Lines changed: 35 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,18 @@ if ARGV.empty?
55
exit 1
66
end
77

8+
def unicode_width(type, category)
9+
return 0 if category == 'Mn' # Nonspacing Mark
10+
case type
11+
when 'F', 'W' # Fullwidth, Wide
12+
2
13+
when 'H', 'Na', 'N' # Halfwidth, Narrow, Neutral
14+
1
15+
when 'A' # Ambiguous
16+
-1
17+
end
18+
end
19+
820
open(ARGV.first, 'rt') do |f|
921
if m = f.gets.match(/^# EastAsianWidth-(\d+\.\d+\.\d+)\.txt/)
1022
unicode_version = m[1]
@@ -13,66 +25,31 @@ open(ARGV.first, 'rt') do |f|
1325
unicode_version = nil
1426
end
1527

16-
list = []
28+
widths = []
1729
f.each_line do |line|
18-
next unless m = line.match(/^(\h+)(?:\.\.(\h+))?\s*;\s*(\w+)\s+#.+/)
30+
next unless /^(?<first>\h+)(?:\.\.(?<last>\h+))?\s*;\s*(?<type>\w+)\s+# +(?<category>[^ ]+)/ =~ line
1931

20-
first = m[1].to_i(16)
21-
last = m[2]&.to_i(16) || first
22-
type = m[3].to_sym
23-
if !list.empty? and (list.last[:range].last + 1) == first and list.last[:type] == type
24-
list.last[:range] = (list.last[:range].first..last)
25-
else
26-
# [\u{D800}-\u{DFFF}] cause error.
27-
unless ((0xD800..0xDFFF).to_a & (first..last).to_a).empty?
28-
unless (first..0xD7FF).to_a.empty?
29-
list << {
30-
range: (first..0xD7FF),
31-
type: type.to_sym
32-
}
33-
end
34-
unless (0xE000..last).to_a.empty?
35-
list << {
36-
range: (first..0xD7FF),
37-
type: type.to_sym
38-
}
39-
end
40-
else
41-
list << {
42-
range: (first..last),
43-
type: type.to_sym
44-
}
45-
end
46-
end
32+
range = first.to_i(16)..(last || first).to_i(16)
33+
widths.fill(unicode_width(type, category), range)
4734
end
48-
grouped = list.group_by { |item| item[:type] }.map { |item| [item.first, item.last.map { |row| row[:range] }] }.to_h
49-
grouped = %i{F H W Na A N}.map { |type| [type, grouped[type]] }
50-
puts <<EOH
51-
class Reline::Unicode::EastAsianWidth
52-
# This is based on EastAsianWidth.txt
53-
# UNICODE_VERSION = #{unicode_version ? "'#{unicode_version}'" : 'nil'}
5435

55-
EOH
56-
puts grouped.map { |item|
57-
type, ranges = item
58-
output = " # %s\n" %
59-
case type
60-
when :F then 'Fullwidth'
61-
when :H then 'Halfwidth'
62-
when :W then 'Wide'
63-
when :Na then 'Narrow'
64-
when :A then 'Ambiguous'
65-
when :N then 'Neutral'
66-
end
67-
output += " TYPE_%s = /^[\#{ %%W(\n" % type.upcase
68-
output += ranges.map { |range|
69-
if range.first == range.last
70-
' \u{%04X}' % range.first
71-
else
72-
' \u{%04X}-\u{%04X}' % [range.first, range.last]
73-
end
74-
}.join("\n")
75-
output += "\n ).join }]/\n"
76-
}.join("\n")
77-
puts 'end'
36+
# EscapedPairs
37+
[*0x00..0x1F, 0x7F].each { |ord| widths[ord] = 2 }
38+
# printable ASCII chars
39+
(0x20..0x7E).each { |ord| widths[ord] = 1 }
40+
41+
chunks = widths.each_with_index.chunk { |width, _idx| width || 1 }
42+
chunk_last_ords = chunks.map { |width, chunk| [chunk.last.last, width] }
43+
chunk_last_ords << [0x7fffffff, 1]
44+
45+
puts <<~EOH
46+
class Reline::Unicode::EastAsianWidth
47+
# This is based on EastAsianWidth.txt
48+
# UNICODE_VERSION = #{unicode_version ? "'#{unicode_version}'" : 'nil'}
49+
50+
CHUNK_LAST, CHUNK_WIDTH = [
51+
#{chunk_last_ords.map { |ord, width| " [0x#{ord.to_s(16)}, #{width}]" }.join(",\n")}
52+
].transpose.map(&:freeze)
53+
end
54+
EOH
7855
end

lib/reline/unicode.rb

Lines changed: 14 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -56,51 +56,26 @@ def self.escape_for_print(str)
5656

5757
require 'reline/unicode/east_asian_width'
5858

59-
HalfwidthDakutenHandakuten = /[\u{FF9E}\u{FF9F}]/
60-
61-
MBCharWidthRE = /
62-
(?<width_2_1>
63-
[#{ EscapedChars.map {|c| "\\x%02x" % c.ord }.join }] (?# ^ + char, such as ^M, ^H, ^[, ...)
64-
)
65-
| (?<width_3>^\u{2E3B}) (?# THREE-EM DASH)
66-
| (?<width_0>^\p{M})
67-
| (?<width_2_2>
68-
#{ EastAsianWidth::TYPE_F }
69-
| #{ EastAsianWidth::TYPE_W }
70-
)
71-
| (?<width_1>
72-
#{ EastAsianWidth::TYPE_H }
73-
| #{ EastAsianWidth::TYPE_NA }
74-
| #{ EastAsianWidth::TYPE_N }
75-
)(?!#{ HalfwidthDakutenHandakuten })
76-
| (?<width_2_3>
77-
(?: #{ EastAsianWidth::TYPE_H }
78-
| #{ EastAsianWidth::TYPE_NA }
79-
| #{ EastAsianWidth::TYPE_N })
80-
#{ HalfwidthDakutenHandakuten }
81-
)
82-
| (?<ambiguous_width>
83-
#{EastAsianWidth::TYPE_A}
84-
)
85-
/x
86-
8759
def self.get_mbchar_width(mbchar)
8860
ord = mbchar.ord
89-
if (0x00 <= ord and ord <= 0x1F) # in EscapedPairs
61+
if ord <= 0x1F # in EscapedPairs
9062
return 2
91-
elsif (0x20 <= ord and ord <= 0x7E) # printable ASCII chars
63+
elsif ord <= 0x7E # printable ASCII chars
9264
return 1
9365
end
94-
m = mbchar.encode(Encoding::UTF_8).match(MBCharWidthRE)
95-
case
96-
when m.nil? then 1 # TODO should be U+FFFD � REPLACEMENT CHARACTER
97-
when m[:width_2_1], m[:width_2_2], m[:width_2_3] then 2
98-
when m[:width_3] then 3
99-
when m[:width_0] then 0
100-
when m[:width_1] then 1
101-
when m[:ambiguous_width] then Reline.ambiguous_width
66+
utf8_mbchar = mbchar.encode(Encoding::UTF_8)
67+
ord = utf8_mbchar.ord
68+
chunk_index = EastAsianWidth::CHUNK_LAST.bsearch_index { |o| ord <= o }
69+
size = EastAsianWidth::CHUNK_WIDTH[chunk_index]
70+
if size == -1
71+
Reline.ambiguous_width
72+
elsif size == 1 && utf8_mbchar.size >= 2
73+
second_char_ord = utf8_mbchar[1].ord
74+
# Halfwidth Dakuten Handakuten
75+
# Only these two characters have the Letter Modifier category and can be combined in a single grapheme cluster
76+
(second_char_ord == 0xFF9E || second_char_ord == 0xFF9F) ? 2 : 1
10277
else
103-
nil
78+
size
10479
end
10580
end
10681

0 commit comments

Comments
 (0)