From 9fc40d2b26fad25960eff6dd4d35d6592f6faaef Mon Sep 17 00:00:00 2001
From: Matt Boldt
Date: Tue, 28 Nov 2023 09:19:25 -0600
Subject: [PATCH] [ruby/prism] Add MacJapanese encoding

MacJapanese (also aliased as MacJapan) is a modified Shift_JIS
encoding, but is implemented identically in Ruby

https://github.com/ruby/prism/commit/9e0a097699
---
 lib/prism/prism.gemspec     |  1 +
 prism/enc/pm_encoding.h     |  1 +
 prism/enc/pm_mac_japanese.c | 80 +++++++++++++++++++++++++++++++++++++
 prism/prism.c               |  2 +
 test/prism/encoding_test.rb |  1 +
 5 files changed, 85 insertions(+)
 create mode 100644 prism/enc/pm_mac_japanese.c

diff --git a/lib/prism/prism.gemspec b/lib/prism/prism.gemspec
index 80cfa8ab7617e6..381ecfea5bf710 100644
--- a/lib/prism/prism.gemspec
+++ b/lib/prism/prism.gemspec
@@ -91,6 +91,7 @@ Gem::Specification.new do |spec|
     "src/enc/pm_cp950.c",
     "src/enc/pm_euc_jp.c",
     "src/enc/pm_gbk.c",
+    "src/enc/pm_mac_japanese.c",
     "src/enc/pm_shift_jis.c",
     "src/enc/pm_tables.c",
     "src/enc/pm_unicode.c",
diff --git a/prism/enc/pm_encoding.h b/prism/enc/pm_encoding.h
index 5b79902389f5c1..797029365c5c4c 100644
--- a/prism/enc/pm_encoding.h
+++ b/prism/enc/pm_encoding.h
@@ -206,6 +206,7 @@ extern pm_encoding_t pm_encoding_mac_croatian;
 extern pm_encoding_t pm_encoding_mac_cyrillic;
 extern pm_encoding_t pm_encoding_mac_greek;
 extern pm_encoding_t pm_encoding_mac_iceland;
+extern pm_encoding_t pm_encoding_mac_japanese;
 extern pm_encoding_t pm_encoding_mac_roman;
 extern pm_encoding_t pm_encoding_mac_romania;
 extern pm_encoding_t pm_encoding_mac_thai;
diff --git a/prism/enc/pm_mac_japanese.c b/prism/enc/pm_mac_japanese.c
new file mode 100644
index 00000000000000..a5185f0e5535af
--- /dev/null
+++ b/prism/enc/pm_mac_japanese.c
@@ -0,0 +1,80 @@
+#include "prism/enc/pm_encoding.h"
+
+/**
+ * Return the width in bytes of the MacJapanese character starting at b, or 0
+ * if the bytes there do not form a valid character. No more than n bytes are
+ * examined. The first byte is read unconditionally, so callers must pass
+ * n >= 1.
+ */
+static size_t
+pm_encoding_mac_japanese_char_width(const uint8_t *b, ptrdiff_t n) {
+    // Single-byte characters: 0x00-0x7F and 0xA1-0xDF.
+    if (*b < 0x80 || (*b >= 0xA1 && *b <= 0xDF)) {
+        return 1;
+    }
+
+    // Double-byte characters: a lead byte in 0x81-0x9F or 0xE0-0xFC followed
+    // by a trail byte in 0x40-0xFC. 0x7F is excluded because it is not a
+    // valid Shift_JIS trail byte, and Ruby rejects it for MacJapanese too.
+    if (
+        (n > 1) &&
+        ((b[0] >= 0x81 && b[0] <= 0x9F) || (b[0] >= 0xE0 && b[0] <= 0xFC)) &&
+        (b[1] >= 0x40 && b[1] <= 0xFC && b[1] != 0x7F)
+    ) {
+        return 2;
+    }
+
+    return 0;
+}
+
+/**
+ * Classify the character at b as alphabetic. Only single-byte characters are
+ * considered; they are delegated to pm_encoding_ascii_alpha_char. Double-byte
+ * and invalid sequences yield 0.
+ */
+static size_t
+pm_encoding_mac_japanese_alpha_char(const uint8_t *b, ptrdiff_t n) {
+    if (pm_encoding_mac_japanese_char_width(b, n) == 1) {
+        return pm_encoding_ascii_alpha_char(b, n);
+    } else {
+        return 0;
+    }
+}
+
+/**
+ * Classify the character at b as alphanumeric. Only single-byte characters
+ * are considered; they are delegated to pm_encoding_ascii_alnum_char.
+ * Double-byte and invalid sequences yield 0.
+ */
+static size_t
+pm_encoding_mac_japanese_alnum_char(const uint8_t *b, ptrdiff_t n) {
+    if (pm_encoding_mac_japanese_char_width(b, n) == 1) {
+        return pm_encoding_ascii_alnum_char(b, n);
+    } else {
+        return 0;
+    }
+}
+
+/**
+ * Classify the character at b as uppercase. Only single-byte characters are
+ * considered; they are delegated to pm_encoding_ascii_isupper_char.
+ * Double-byte and invalid sequences yield false.
+ */
+static bool
+pm_encoding_mac_japanese_isupper_char(const uint8_t *b, ptrdiff_t n) {
+    if (pm_encoding_mac_japanese_char_width(b, n) == 1) {
+        return pm_encoding_ascii_isupper_char(b, n);
+    } else {
+        return false;
+    }
+}
+
+/** MacJapanese encoding */
+pm_encoding_t pm_encoding_mac_japanese = {
+    .name = "MacJapanese",
+    .char_width = pm_encoding_mac_japanese_char_width,
+    .alnum_char = pm_encoding_mac_japanese_alnum_char,
+    .alpha_char = pm_encoding_mac_japanese_alpha_char,
+    .isupper_char = pm_encoding_mac_japanese_isupper_char,
+    .multibyte = true
+};
diff --git a/prism/prism.c b/prism/prism.c
index 960b652db8a397..154d8ea6b26eee 100644
--- a/prism/prism.c
+++ b/prism/prism.c
@@ -6303,6 +6303,8 @@ parser_lex_magic_comment_encoding_value(pm_parser_t *parser, const uint8_t *star
                 ENCODING1("macCyrillic", pm_encoding_mac_cyrillic);
                 ENCODING1("macGreek", pm_encoding_mac_greek);
                 ENCODING1("macIceland", pm_encoding_mac_iceland);
+                ENCODING1("MacJapanese", pm_encoding_mac_japanese);
+                ENCODING1("MacJapan", pm_encoding_mac_japanese);
                 ENCODING1("macRoman", pm_encoding_mac_roman);
                 ENCODING1("macRomania", pm_encoding_mac_romania);
                 ENCODING1("macThai", pm_encoding_mac_thai);
diff --git a/test/prism/encoding_test.rb b/test/prism/encoding_test.rb
index 28992fcf1b358b..76162bec1ec22f 100644
--- a/test/prism/encoding_test.rb
+++ b/test/prism/encoding_test.rb
@@ -72,6 +72,7 @@ class EncodingTest < TestCase
       Encoding::CP950 => 0x00...0x10000,
       Encoding::CP51932 => 0x00...0x10000,
       Encoding::GBK => 0x00...0x10000,
+      Encoding::MACJAPANESE => 0x00...0x10000,
       Encoding::Shift_JIS => 0x00...0x10000,
       Encoding::Windows_31J => 0x00...0x10000
     }