Skip to content

Commit 0670dd3

Browse files
committed
add Windows-874 encoding
1 parent 5da0672 commit 0670dd3

File tree

6 files changed

+41
-0
lines changed

6 files changed

+41
-0
lines changed

bin/encodings

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -96,5 +96,6 @@ when "windows-1256" then table(Encoding::Windows_1256)
9696
when "windows-1257" then table(Encoding::Windows_1257)
9797
when "windows-1258" then table(Encoding::Windows_1258)
9898
when "windows-31j" then lists("windows-31j", 0...0x10000, Encoding::Windows_31J)
99+
when "windows-874" then table(Encoding::Windows_874)
99100
else raise "Unknown encoding `#{ARGV[0]}'"
100101
end

docs/encoding.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,7 @@ The key of the comment can be either "encoding" or "coding". The value of the co
6363
* `Windows-1257`
6464
* `Windows-1258`
6565
* `Windows-31J`
66+
* `Windows-874`
6667

6768
For each of these encodings, prism provides a function for checking if the subsequent bytes form an alphabetic or alphanumeric character.
6869

include/prism/enc/pm_encoding.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -206,5 +206,6 @@ extern pm_encoding_t pm_encoding_windows_1256;
206206
extern pm_encoding_t pm_encoding_windows_1257;
207207
extern pm_encoding_t pm_encoding_windows_1258;
208208
extern pm_encoding_t pm_encoding_windows_31j;
209+
extern pm_encoding_t pm_encoding_windows_874;
209210

210211
#endif

src/enc/pm_tables.c

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1008,6 +1008,30 @@ static uint8_t pm_encoding_windows_1258_table[256] = {
10081008
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Fx
10091009
};
10101010

1011+
/**
1012+
 * Each element of the following table is a bitfield recording the character
1013+
 * classes (alphabetic, alphanumeric, uppercase) of that windows-874 byte.
1014+
*/
1015+
static uint8_t pm_encoding_windows_874_table[256] = {
1016+
// 0 1 2 3 4 5 6 7 8 9 A B C D E F
1017+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x
1018+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 1x
1019+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 2x
1020+
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, // 3x
1021+
0, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // 4x
1022+
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 0, 0, 0, 0, 0, // 5x
1023+
0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 6x
1024+
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, // 7x
1025+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 8x
1026+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 9x
1027+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Ax
1028+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Bx
1029+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Cx
1030+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Dx
1031+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Ex
1032+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Fx
1033+
};
1034+
10111035
/**
10121036
* Returns the size of the next character in the ASCII encoding. This basically
10131037
* means that if the top bit is not set, the character is 1 byte long.
@@ -1114,6 +1138,7 @@ PRISM_ENCODING_TABLE(windows_1255)
11141138
PRISM_ENCODING_TABLE(windows_1256)
11151139
PRISM_ENCODING_TABLE(windows_1257)
11161140
PRISM_ENCODING_TABLE(windows_1258)
1141+
PRISM_ENCODING_TABLE(windows_874)
11171142

11181143
#undef PRISM_ENCODING_TABLE
11191144

@@ -1546,3 +1571,13 @@ pm_encoding_t pm_encoding_windows_1258 = {
15461571
.isupper_char = pm_encoding_windows_1258_isupper_char,
15471572
.multibyte = false
15481573
};
1574+
1575+
/** Windows-874 */
1576+
pm_encoding_t pm_encoding_windows_874 = {
1577+
.name = "Windows-874",
1578+
.char_width = pm_encoding_single_char_width,
1579+
.alnum_char = pm_encoding_windows_874_alnum_char,
1580+
.alpha_char = pm_encoding_windows_874_alpha_char,
1581+
.isupper_char = pm_encoding_windows_874_isupper_char,
1582+
.multibyte = false
1583+
};

src/prism.c

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6075,6 +6075,7 @@ parser_lex_magic_comment_encoding_value(pm_parser_t *parser, const uint8_t *star
60756075
ENCODING1("CP860", pm_encoding_ibm860);
60766076
ENCODING1("CP861", pm_encoding_ibm861);
60776077
ENCODING1("CP862", pm_encoding_ibm862);
6078+
ENCODING1("CP874", pm_encoding_windows_874);
60786079
ENCODING1("CP878", pm_encoding_koi8_r);
60796080
ENCODING2("CP932", "csWindows31J", pm_encoding_windows_31j);
60806081
ENCODING1("CP936", pm_encoding_gbk);
@@ -6152,6 +6153,7 @@ parser_lex_magic_comment_encoding_value(pm_parser_t *parser, const uint8_t *star
61526153
break;
61536154
case 'W': case 'w':
61546155
ENCODING1("Windows-31J", pm_encoding_windows_31j);
6156+
ENCODING1("Windows-874", pm_encoding_windows_874);
61556157
ENCODING1("Windows-1250", pm_encoding_windows_1250);
61566158
ENCODING1("Windows-1251", pm_encoding_windows_1251);
61576159
ENCODING1("Windows-1252", pm_encoding_windows_1252);

test/prism/encoding_test.rb

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@ class EncodingTest < TestCase
5050
Encoding::Windows_1256 => 0x00...0x100,
5151
Encoding::Windows_1257 => 0x00...0x100,
5252
Encoding::Windows_1258 => 0x00...0x100,
53+
Encoding::Windows_874 => 0x00...0x100,
5354
Encoding::Big5 => 0x00...0x10000,
5455
Encoding::CP51932 => 0x00...0x10000,
5556
Encoding::GBK => 0x00...0x10000,

0 commit comments

Comments
 (0)