-
Notifications
You must be signed in to change notification settings - Fork 83
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Improve performance of Reline::Unicode.get_mbchar_width
#632
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -5,6 +5,18 @@ if ARGV.empty? | |
exit 1 | ||
end | ||
|
||
# Maps one EastAsianWidth.txt entry to the display width stored for it.
#
# @param type [String] East_Asian_Width property value
#   ('F', 'W', 'H', 'Na', 'N' or 'A')
# @param category [String] General_Category abbreviation taken from the
#   trailing comment of the data line (e.g. 'Mn')
# @return [Integer, nil] 0 for nonspacing marks (checked first, regardless of
#   type), 2 for Fullwidth/Wide, 1 for Halfwidth/Narrow/Neutral, -1 as the
#   marker for Ambiguous, and nil when the type is not one of the known values
def unicode_width(type, category)
  return 0 if category == 'Mn' # Nonspacing Mark

  case type
  when 'F', 'W' then 2 # Fullwidth, Wide
  when 'H', 'Na', 'N' then 1 # Halfwidth, Narrow, Neutral
  when 'A' then -1 # Ambiguous
  end
end
|
||
open(ARGV.first, 'rt') do |f| | ||
if m = f.gets.match(/^# EastAsianWidth-(\d+\.\d+\.\d+)\.txt/) | ||
unicode_version = m[1] | ||
|
@@ -13,66 +25,31 @@ open(ARGV.first, 'rt') do |f| | |
unicode_version = nil | ||
end | ||
|
||
list = [] | ||
widths = [] | ||
f.each_line do |line| | ||
next unless m = line.match(/^(\h+)(?:\.\.(\h+))?\s*;\s*(\w+)\s+#.+/) | ||
next unless /^(?<first>\h+)(?:\.\.(?<last>\h+))?\s*;\s*(?<type>\w+)\s+# +(?<category>[^ ]+)/ =~ line | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. 📝
https://www.unicode.org/Public/15.1.0/ucd/EastAsianWidth.txt |
||
|
||
first = m[1].to_i(16) | ||
last = m[2]&.to_i(16) || first | ||
type = m[3].to_sym | ||
if !list.empty? and (list.last[:range].last + 1) == first and list.last[:type] == type | ||
list.last[:range] = (list.last[:range].first..last) | ||
else | ||
# [\u{D800}-\u{DFFF}] cause error. | ||
unless ((0xD800..0xDFFF).to_a & (first..last).to_a).empty? | ||
unless (first..0xD7FF).to_a.empty? | ||
list << { | ||
range: (first..0xD7FF), | ||
type: type.to_sym | ||
} | ||
end | ||
unless (0xE000..last).to_a.empty? | ||
list << { | ||
range: (first..0xD7FF), | ||
type: type.to_sym | ||
} | ||
end | ||
else | ||
list << { | ||
range: (first..last), | ||
type: type.to_sym | ||
} | ||
end | ||
end | ||
range = first.to_i(16)..(last || first).to_i(16) | ||
widths.fill(unicode_width(type, category), range) | ||
end | ||
grouped = list.group_by { |item| item[:type] }.map { |item| [item.first, item.last.map { |row| row[:range] }] }.to_h | ||
grouped = %i{F H W Na A N}.map { |type| [type, grouped[type]] } | ||
puts <<EOH | ||
class Reline::Unicode::EastAsianWidth | ||
# This is based on EastAsianWidth.txt | ||
# UNICODE_VERSION = #{unicode_version ? "'#{unicode_version}'" : 'nil'} | ||
|
||
EOH | ||
puts grouped.map { |item| | ||
type, ranges = item | ||
output = " # %s\n" % | ||
case type | ||
when :F then 'Fullwidth' | ||
when :H then 'Halfwidth' | ||
when :W then 'Wide' | ||
when :Na then 'Narrow' | ||
when :A then 'Ambiguous' | ||
when :N then 'Neutral' | ||
end | ||
output += " TYPE_%s = /^[\#{ %%W(\n" % type.upcase | ||
output += ranges.map { |range| | ||
if range.first == range.last | ||
' \u{%04X}' % range.first | ||
else | ||
' \u{%04X}-\u{%04X}' % [range.first, range.last] | ||
end | ||
}.join("\n") | ||
output += "\n ).join }]/\n" | ||
}.join("\n") | ||
puts 'end' | ||
# EscapedPairs | ||
[*0x00..0x1F, 0x7F].each { |ord| widths[ord] = 2 } | ||
# printable ASCII chars | ||
(0x20..0x7E).each { |ord| widths[ord] = 1 } | ||
|
||
chunks = widths.each_with_index.chunk { |width, _idx| width || 1 } | ||
chunk_last_ords = chunks.map { |width, chunk| [chunk.last.last, width] } | ||
chunk_last_ords << [0x7fffffff, 1] | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I added this value (the maximum int32 value) as a substitute for infinity. This way, we don't need to test
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Ah, I see. I agree that using |
||
|
||
puts <<~EOH | ||
class Reline::Unicode::EastAsianWidth | ||
# This is based on EastAsianWidth.txt | ||
# UNICODE_VERSION = #{unicode_version ? "'#{unicode_version}'" : 'nil'} | ||
|
||
CHUNK_LAST, CHUNK_WIDTH = [ | ||
#{chunk_last_ords.map { |ord, width| " [0x#{ord.to_s(16)}, #{width}]" }.join(",\n")} | ||
].transpose.map(&:freeze) | ||
end | ||
EOH | ||
end |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -56,51 +56,26 @@ def self.escape_for_print(str) | |
|
||
require 'reline/unicode/east_asian_width' | ||
|
||
HalfwidthDakutenHandakuten = /[\u{FF9E}\u{FF9F}]/ | ||
|
||
MBCharWidthRE = / | ||
(?<width_2_1> | ||
[#{ EscapedChars.map {|c| "\\x%02x" % c.ord }.join }] (?# ^ + char, such as ^M, ^H, ^[, ...) | ||
) | ||
| (?<width_3>^\u{2E3B}) (?# THREE-EM DASH) | ||
| (?<width_0>^\p{M}) | ||
| (?<width_2_2> | ||
#{ EastAsianWidth::TYPE_F } | ||
| #{ EastAsianWidth::TYPE_W } | ||
) | ||
| (?<width_1> | ||
#{ EastAsianWidth::TYPE_H } | ||
| #{ EastAsianWidth::TYPE_NA } | ||
| #{ EastAsianWidth::TYPE_N } | ||
)(?!#{ HalfwidthDakutenHandakuten }) | ||
| (?<width_2_3> | ||
(?: #{ EastAsianWidth::TYPE_H } | ||
| #{ EastAsianWidth::TYPE_NA } | ||
| #{ EastAsianWidth::TYPE_N }) | ||
#{ HalfwidthDakutenHandakuten } | ||
) | ||
| (?<ambiguous_width> | ||
#{EastAsianWidth::TYPE_A} | ||
) | ||
/x | ||
|
||
def self.get_mbchar_width(mbchar) | ||
ord = mbchar.ord | ||
if (0x00 <= ord and ord <= 0x1F) # in EscapedPairs | ||
if ord <= 0x1F # in EscapedPairs | ||
return 2 | ||
elsif (0x20 <= ord and ord <= 0x7E) # printable ASCII chars | ||
elsif ord <= 0x7E # printable ASCII chars | ||
return 1 | ||
end | ||
m = mbchar.encode(Encoding::UTF_8).match(MBCharWidthRE) | ||
case | ||
when m.nil? then 1 # TODO should be U+FFFD � REPLACEMENT CHARACTER | ||
when m[:width_2_1], m[:width_2_2], m[:width_2_3] then 2 | ||
when m[:width_3] then 3 | ||
when m[:width_0] then 0 | ||
when m[:width_1] then 1 | ||
when m[:ambiguous_width] then Reline.ambiguous_width | ||
utf8_mbchar = mbchar.encode(Encoding::UTF_8) | ||
ord = utf8_mbchar.ord | ||
chunk_index = EastAsianWidth::CHUNK_LAST.bsearch_index { |o| ord <= o } | ||
size = EastAsianWidth::CHUNK_WIDTH[chunk_index] | ||
if size == -1 | ||
Reline.ambiguous_width | ||
elsif size == 1 && utf8_mbchar.size >= 2 | ||
second_char_ord = utf8_mbchar[1].ord | ||
# Halfwidth Dakuten Handakuten | ||
# Only these two characters have the Modifier Letter (Lm) category and can be combined into a single grapheme cluster | ||
(second_char_ord == 0xFF9E || second_char_ord == 0xFF9F) ? 2 : 1 | ||
Comment on lines
+72
to
+76
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. 📝 This is a special case specifically for when there is a character with width 1 before U+FF9E or U+FF9F. In other words, it allows the width to be calculated correctly for something like ‘ガ’. However, it cannot calculate the width correctly for ‘が’. |
||
else | ||
nil | ||
size | ||
end | ||
end | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is there any reason why the Nonspacing Mark should be set to 0?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
In the old implementation, all Mark characters (
Mn (nonspacing mark) + Mc (spacing combining mark) + Me (enclosing mark)
) were set to width 0. I think this was simply a mistake for
\p{Mn}
, because a Mark is not always zero-width. I think there are two choices, and I chose the latter one.
The actual widths calculated by the script below are:
Mn is not zero-width in some terminal emulators, but I prefer not to change the original intention in this pull request, except for fixing bugs.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks for your explanation. That makes sense to me now.