Skip to content

Commit a362535

Browse files
committed
Add windows-1250 encoding
1 parent c787d2e commit a362535

File tree

6 files changed

+138
-78
lines changed

6 files changed

+138
-78
lines changed

bin/encodings

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -69,8 +69,9 @@ when "koi8-r" then table(Encoding::KOI8_R)
6969
when "shift_jis" then lists("shift_jis", 0...0x10000, Encoding::Shift_JIS)
7070
when "utf8" then table(Encoding::UTF_8) and lists("utf8", 0x100...0x110000, Encoding::UTF_8)
7171
when "utf8-mac" then table(Encoding::UTF8_MAC) and lists("utf8-mac", 0x100...0x110000, Encoding::UTF8_MAC)
72+
when "windows-1250" then table(Encoding::Windows_1250)
7273
when "windows-1251" then table(Encoding::Windows_1251)
7374
when "windows-1252" then table(Encoding::Windows_1252)
74-
when "windows-31j" then lists("windows-31j", 0...0x10000, Encoding::Windows_31)
75+
when "windows-31j" then lists("windows-31j", 0...0x10000, Encoding::Windows_31J)
7576
else raise "Unknown encoding `#{ARGV[0]}'"
7677
end

docs/encoding.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@ The key of the comment can be either "encoding" or "coding". The value of the co
4141
* `utf-8`
4242
* `utf8-mac`
4343
* `windows-31j`
44+
* `windows-1250`
4445
* `windows-1251`
4546
* `windows-1252`
4647

include/prism/enc/pm_encoding.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -180,8 +180,9 @@ extern pm_encoding_t pm_encoding_koi8_r;
180180
extern pm_encoding_t pm_encoding_shift_jis;
181181
extern pm_encoding_t pm_encoding_utf_8;
182182
extern pm_encoding_t pm_encoding_utf8_mac;
183-
extern pm_encoding_t pm_encoding_windows_31j;
183+
extern pm_encoding_t pm_encoding_windows_1250;
184184
extern pm_encoding_t pm_encoding_windows_1251;
185185
extern pm_encoding_t pm_encoding_windows_1252;
186+
extern pm_encoding_t pm_encoding_windows_31j;
186187

187188
#endif

src/enc/pm_tables.c

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -408,6 +408,30 @@ static uint8_t pm_encoding_koi8_r_table[256] = {
408408
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // Fx
409409
};
410410

411+
/**
412+
* Each element of the following table contains a bitfield that indicates a
413+
* piece of information about the corresponding windows-1250 character.
414+
*/
415+
static uint8_t pm_encoding_windows_1250_table[256] = {
416+
// 0 1 2 3 4 5 6 7 8 9 A B C D E F
417+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x
418+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 1x
419+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 2x
420+
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, // 3x
421+
0, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // 4x
422+
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 0, 0, 0, 0, 0, // 5x
423+
0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 6x
424+
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, // 7x
425+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 0, 7, 7, 7, 7, // 8x
426+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 3, 3, 3, 3, // 9x
427+
0, 0, 0, 7, 0, 7, 0, 0, 0, 0, 7, 0, 0, 0, 0, 7, // Ax
428+
0, 0, 0, 3, 0, 3, 0, 0, 0, 3, 3, 0, 7, 0, 3, 3, // Bx
429+
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // Cx
430+
7, 7, 7, 7, 7, 7, 7, 0, 7, 7, 7, 7, 7, 7, 7, 3, // Dx
431+
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // Ex
432+
3, 3, 3, 3, 3, 3, 3, 0, 3, 3, 3, 3, 3, 3, 3, 0, // Fx
433+
};
434+
411435
/**
412436
* Each element of the following table contains a bitfield that indicates a
413437
* piece of information about the corresponding windows-1251 character.
@@ -537,6 +561,7 @@ PRISM_ENCODING_TABLE(iso_8859_14)
537561
PRISM_ENCODING_TABLE(iso_8859_15)
538562
PRISM_ENCODING_TABLE(iso_8859_16)
539563
PRISM_ENCODING_TABLE(koi8_r)
564+
PRISM_ENCODING_TABLE(windows_1250)
540565
PRISM_ENCODING_TABLE(windows_1251)
541566
PRISM_ENCODING_TABLE(windows_1252)
542567

@@ -722,6 +747,16 @@ pm_encoding_t pm_encoding_koi8_r = {
722747
.multibyte = false
723748
};
724749

750+
/** Windows-1250 */
751+
pm_encoding_t pm_encoding_windows_1250 = {
752+
.name = "windows-1250",
753+
.char_width = pm_encoding_single_char_width,
754+
.alnum_char = pm_encoding_windows_1250_alnum_char,
755+
.alpha_char = pm_encoding_windows_1250_alpha_char,
756+
.isupper_char = pm_encoding_windows_1250_isupper_char,
757+
.multibyte = false
758+
};
759+
725760
/** Windows-1251 */
726761
pm_encoding_t pm_encoding_windows_1251 = {
727762
.name = "windows-1251",

src/prism.c

Lines changed: 55 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -6022,10 +6022,18 @@ parser_lex_magic_comment_encoding_value(pm_parser_t *parser, const uint8_t *star
60226022
}
60236023

60246024
// Next, we're going to check for UTF-8. This is the most common encoding.
6025-
// Extensions like utf-8 can contain extra encoding details like,
6026-
// utf-8-dos, utf-8-linux, utf-8-mac. We treat these all as utf-8 should
6027-
// treat any encoding starting utf-8 as utf-8.
6025+
// utf-8 can contain extra information at the end about the platform it is
6026+
// encoded on, such as utf-8-mac or utf-8-unix. We'll ignore those suffixes.
60286027
if ((start + 5 <= end) && (pm_strncasecmp(start, (const uint8_t *) "utf-8", 5) == 0)) {
6028+
// We need to explicitly handle utf-8-hfs, as that one needs to switch
6029+
// over to being utf8-mac.
6030+
if (width == 9 && (pm_strncasecmp(start + 5, (const uint8_t *) "-hfs", 4) == 0)) {
6031+
parser->encoding = pm_encoding_utf8_mac;
6032+
parser->encoding_changed = true;
6033+
if (parser->encoding_changed_callback != NULL) parser->encoding_changed_callback(parser);
6034+
return true;
6035+
}
6036+
60296037
// We don't need to do anything here because the default encoding is
60306038
// already UTF-8. We'll just return.
60316039
return true;
@@ -6036,48 +6044,58 @@ parser_lex_magic_comment_encoding_value(pm_parser_t *parser, const uint8_t *star
60366044
#define ENCODING(value, prebuilt) \
60376045
if (width == sizeof(value) - 1 && start + width <= end && pm_strncasecmp(start, (const uint8_t *) value, width) == 0) { \
60386046
parser->encoding = prebuilt; \
6039-
parser->encoding_changed |= true; \
6047+
parser->encoding_changed = true; \
60406048
if (parser->encoding_changed_callback != NULL) parser->encoding_changed_callback(parser); \
60416049
return true; \
60426050
}
60436051

6052+
// Build convenience macros to compare aliases for the same encoding.
6053+
#define ENCODING2(value1, value2, prebuilt) ENCODING(value1, prebuilt) ENCODING(value2, prebuilt)
6054+
#define ENCODING3(value1, value2, value3, prebuilt) ENCODING2(value1, value2, prebuilt) ENCODING(value3, prebuilt)
6055+
#define ENCODING4(value1, value2, value3, value4, prebuilt) ENCODING3(value1, value2, value3, prebuilt) ENCODING(value4, prebuilt)
6056+
#define ENCODING5(value1, value2, value3, value4, value5, prebuilt) ENCODING4(value1, value2, value3, value4, prebuilt) ENCODING(value5, prebuilt)
6057+
60446058
// Check most common first. (This is pretty arbitrary.)
6045-
ENCODING("ascii", pm_encoding_ascii);
6046-
ENCODING("ascii-8bit", pm_encoding_ascii_8bit);
6047-
ENCODING("us-ascii", pm_encoding_ascii);
6048-
ENCODING("binary", pm_encoding_ascii_8bit);
6049-
ENCODING("shift_jis", pm_encoding_shift_jis);
6050-
ENCODING("euc-jp", pm_encoding_euc_jp);
6059+
ENCODING("ASCII", pm_encoding_ascii);
6060+
ENCODING("ASCII-8BIT", pm_encoding_ascii_8bit);
6061+
ENCODING("US-ASCII", pm_encoding_ascii);
6062+
ENCODING("BINARY", pm_encoding_ascii_8bit);
6063+
ENCODING("Shift_JIS", pm_encoding_shift_jis);
6064+
ENCODING("EUC-JP", pm_encoding_euc_jp);
60516065

60526066
// Then check all the others.
6053-
ENCODING("big5", pm_encoding_big5);
6067+
ENCODING2("ANSI_X3.4-1968", "646", pm_encoding_ascii);
60546068
ENCODING("cp51932", pm_encoding_cp51932);
6055-
ENCODING("gbk", pm_encoding_gbk);
6056-
ENCODING("iso-8859-1", pm_encoding_iso_8859_1);
6057-
ENCODING("iso-8859-2", pm_encoding_iso_8859_2);
6058-
ENCODING("iso-8859-3", pm_encoding_iso_8859_3);
6059-
ENCODING("iso-8859-4", pm_encoding_iso_8859_4);
6060-
ENCODING("iso-8859-5", pm_encoding_iso_8859_5);
6061-
ENCODING("iso-8859-6", pm_encoding_iso_8859_6);
6062-
ENCODING("iso-8859-7", pm_encoding_iso_8859_7);
6063-
ENCODING("iso-8859-8", pm_encoding_iso_8859_8);
6064-
ENCODING("iso-8859-9", pm_encoding_iso_8859_9);
6065-
ENCODING("iso-8859-10", pm_encoding_iso_8859_10);
6066-
ENCODING("iso-8859-11", pm_encoding_iso_8859_11);
6067-
ENCODING("iso-8859-13", pm_encoding_iso_8859_13);
6068-
ENCODING("iso-8859-14", pm_encoding_iso_8859_14);
6069-
ENCODING("iso-8859-15", pm_encoding_iso_8859_15);
6070-
ENCODING("iso-8859-16", pm_encoding_iso_8859_16);
6071-
ENCODING("koi8-r", pm_encoding_koi8_r);
6072-
ENCODING("windows-31j", pm_encoding_windows_31j);
6073-
ENCODING("windows-1251", pm_encoding_windows_1251);
6074-
ENCODING("windows-1252", pm_encoding_windows_1252);
6075-
ENCODING("cp1251", pm_encoding_windows_1251);
6076-
ENCODING("cp1252", pm_encoding_windows_1252);
6077-
ENCODING("cp932", pm_encoding_windows_31j);
6078-
ENCODING("sjis", pm_encoding_windows_31j);
6079-
ENCODING("utf8-mac", pm_encoding_utf8_mac);
6080-
6069+
ENCODING("eucJP", pm_encoding_euc_jp);
6070+
ENCODING("Big5", pm_encoding_big5);
6071+
ENCODING2("GBK", "CP936", pm_encoding_gbk);
6072+
ENCODING2("ISO-8859-1", "ISO8859-1", pm_encoding_iso_8859_1);
6073+
ENCODING2("ISO-8859-2", "ISO8859-2", pm_encoding_iso_8859_2);
6074+
ENCODING2("ISO-8859-3", "ISO8859-3", pm_encoding_iso_8859_3);
6075+
ENCODING2("ISO-8859-4", "ISO8859-4", pm_encoding_iso_8859_4);
6076+
ENCODING2("ISO-8859-5", "ISO8859-5", pm_encoding_iso_8859_5);
6077+
ENCODING2("ISO-8859-6", "ISO8859-6", pm_encoding_iso_8859_6);
6078+
ENCODING2("ISO-8859-7", "ISO8859-7", pm_encoding_iso_8859_7);
6079+
ENCODING2("ISO-8859-8", "ISO8859-8", pm_encoding_iso_8859_8);
6080+
ENCODING2("ISO-8859-9", "ISO8859-9", pm_encoding_iso_8859_9);
6081+
ENCODING2("ISO-8859-10", "ISO8859-10", pm_encoding_iso_8859_10);
6082+
ENCODING2("ISO-8859-11", "ISO8859-11", pm_encoding_iso_8859_11);
6083+
ENCODING2("ISO-8859-13", "ISO8859-13", pm_encoding_iso_8859_13);
6084+
ENCODING2("ISO-8859-14", "ISO8859-14", pm_encoding_iso_8859_14);
6085+
ENCODING2("ISO-8859-15", "ISO8859-15", pm_encoding_iso_8859_15);
6086+
ENCODING2("ISO-8859-16", "ISO8859-16", pm_encoding_iso_8859_16);
6087+
ENCODING2("KOI8-R", "CP878", pm_encoding_koi8_r);
6088+
ENCODING4("CP65001", "locale", "external", "filesystem", pm_encoding_utf_8);
6089+
ENCODING3("UTF8-MAC", "UTF-8-MAC", "UTF-8-HFS", pm_encoding_utf8_mac);
6090+
ENCODING2("Windows-1250", "CP1250", pm_encoding_windows_1250);
6091+
ENCODING2("Windows-1251", "CP1251", pm_encoding_windows_1251);
6092+
ENCODING2("Windows-1252", "CP1252", pm_encoding_windows_1252);
6093+
ENCODING5("Windows-31J", "CP932", "csWindows31J", "SJIS", "PCK", pm_encoding_windows_31j);
6094+
6095+
#undef ENCODING2
6096+
#undef ENCODING3
6097+
#undef ENCODING4
6098+
#undef ENCODING5
60816099
#undef ENCODING
60826100

60836101
return false;

test/prism/encoding_test.rb

Lines changed: 43 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -4,45 +4,49 @@
44

55
module Prism
66
class EncodingTest < TestCase
7-
%w[
8-
ascii
9-
ascii-8bit
10-
big5
11-
binary
12-
euc-jp
13-
gbk
14-
iso-8859-1
15-
iso-8859-2
16-
iso-8859-3
17-
iso-8859-4
18-
iso-8859-5
19-
iso-8859-6
20-
iso-8859-7
21-
iso-8859-8
22-
iso-8859-9
23-
iso-8859-10
24-
iso-8859-11
25-
iso-8859-13
26-
iso-8859-14
27-
iso-8859-15
28-
iso-8859-16
29-
koi8-r
30-
shift_jis
31-
sjis
32-
us-ascii
33-
utf-8
34-
utf8-mac
35-
windows-31j
36-
windows-1251
37-
windows-1252
38-
CP1251
39-
CP1252
40-
CP51932
41-
].each do |encoding|
42-
define_method "test_encoding_#{encoding}" do
43-
result = Prism.parse("# encoding: #{encoding}\n'string'")
44-
actual = result.value.statements.body.first.unescaped.encoding
45-
assert_equal Encoding.find(encoding), actual
7+
[
8+
"US-ASCII",
9+
"ASCII-8BIT",
10+
"Big5",
11+
"CP51932",
12+
"EUC-JP",
13+
"GBK",
14+
"ISO-8859-1",
15+
"ISO-8859-2",
16+
"ISO-8859-3",
17+
"ISO-8859-4",
18+
"ISO-8859-5",
19+
"ISO-8859-6",
20+
"ISO-8859-7",
21+
"ISO-8859-8",
22+
"ISO-8859-9",
23+
"ISO-8859-10",
24+
"ISO-8859-11",
25+
"ISO-8859-13",
26+
"ISO-8859-14",
27+
"ISO-8859-15",
28+
"ISO-8859-16",
29+
"KOI8-R",
30+
"Shift_JIS",
31+
"Windows-31J",
32+
"UTF-8",
33+
"UTF8-MAC",
34+
"Windows-1250",
35+
"Windows-1251",
36+
"Windows-1252",
37+
].each do |canonical_name|
38+
encoding = Encoding.find(canonical_name)
39+
40+
encoding.names.each do |name|
41+
# Even though UTF-8-MAC is an alias for UTF8-MAC, CRuby treats it as
42+
# UTF-8. So we'll skip this test.
43+
next if name == "UTF-8-MAC"
44+
45+
define_method "test_encoding_#{name}" do
46+
result = Prism.parse("# encoding: #{name}\n'string'")
47+
actual = result.value.statements.body.first.unescaped.encoding
48+
assert_equal encoding, actual
49+
end
4650
end
4751
end
4852

0 commit comments

Comments
 (0)