Skip to content

Commit

Permalink
[ruby/prism] Triple-check prism encodings
Browse files Browse the repository at this point in the history
  • Loading branch information
kddnewton authored and matzbot committed Feb 26, 2024
1 parent f541223 commit 34bad6d
Showing 1 changed file with 101 additions and 36 deletions.
137 changes: 101 additions & 36 deletions prism/encoding.c
Expand Up @@ -1499,7 +1499,7 @@ static const pm_unicode_codepoint_t unicode_alnum_codepoints[UNICODE_ALNUM_CODEP
0x31350, 0x323AF,
};

#define UNICODE_ISUPPER_CODEPOINTS_LENGTH 1296
#define UNICODE_ISUPPER_CODEPOINTS_LENGTH 1302
static const pm_unicode_codepoint_t unicode_isupper_codepoints[UNICODE_ISUPPER_CODEPOINTS_LENGTH] = {
0x100, 0x100,
0x102, 0x102,
Expand Down Expand Up @@ -1582,9 +1582,9 @@ static const pm_unicode_codepoint_t unicode_isupper_codepoints[UNICODE_ISUPPER_C
0x1B5, 0x1B5,
0x1B7, 0x1B8,
0x1BC, 0x1BC,
0x1C4, 0x1C4,
0x1C7, 0x1C7,
0x1CA, 0x1CA,
0x1C4, 0x1C5,
0x1C7, 0x1C8,
0x1CA, 0x1CB,
0x1CD, 0x1CD,
0x1CF, 0x1CF,
0x1D1, 0x1D1,
Expand All @@ -1602,7 +1602,7 @@ static const pm_unicode_codepoint_t unicode_isupper_codepoints[UNICODE_ISUPPER_C
0x1EA, 0x1EA,
0x1EC, 0x1EC,
0x1EE, 0x1EE,
0x1F1, 0x1F1,
0x1F1, 0x1F2,
0x1F4, 0x1F4,
0x1F6, 0x1F8,
0x1FA, 0x1FA,
Expand Down Expand Up @@ -1910,11 +1910,14 @@ static const pm_unicode_codepoint_t unicode_isupper_codepoints[UNICODE_ISUPPER_C
0x1F5D, 0x1F5D,
0x1F5F, 0x1F5F,
0x1F68, 0x1F6F,
0x1FB8, 0x1FBB,
0x1FC8, 0x1FCB,
0x1F88, 0x1F8F,
0x1F98, 0x1F9F,
0x1FA8, 0x1FAF,
0x1FB8, 0x1FBC,
0x1FC8, 0x1FCC,
0x1FD8, 0x1FDB,
0x1FE8, 0x1FEC,
0x1FF8, 0x1FFB,
0x1FF8, 0x1FFC,
0x2102, 0x2102,
0x2107, 0x2107,
0x210B, 0x210D,
Expand Down Expand Up @@ -2455,7 +2458,7 @@ pm_encoding_cesu_8_isupper_char(const uint8_t *b, ptrdiff_t n) {

/**
* Each element of the following table contains a bitfield that indicates a
* piece of information about the corresponding ASCII character.
* piece of information about the corresponding US-ASCII character.
*/
static const uint8_t pm_encoding_ascii_table[256] = {
// 0 1 2 3 4 5 6 7 8 9 A B C D E F
Expand Down Expand Up @@ -3624,7 +3627,7 @@ static const uint8_t pm_encoding_windows_1250_table[256] = {
0, 0, 0, 7, 0, 7, 0, 0, 0, 0, 7, 0, 0, 0, 0, 7, // Ax
0, 0, 0, 3, 0, 3, 0, 0, 0, 3, 3, 0, 7, 0, 3, 3, // Bx
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // Cx
7, 7, 7, 7, 7, 7, 7, 0, 7, 7, 7, 7, 7, 7, 7, 3, // Dx
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 3, // Dx
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // Ex
3, 3, 3, 3, 3, 3, 3, 0, 3, 3, 3, 3, 3, 3, 3, 0, // Fx
};
Expand Down Expand Up @@ -3672,7 +3675,7 @@ static const uint8_t pm_encoding_windows_1252_table[256] = {
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, // Ax
0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, // Bx
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // Cx
7, 7, 7, 7, 7, 7, 7, 0, 7, 7, 7, 7, 7, 7, 7, 3, // Dx
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 3, // Dx
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // Ex
3, 3, 3, 3, 3, 3, 3, 0, 3, 3, 3, 3, 3, 3, 3, 3, // Fx
};
Expand Down Expand Up @@ -4022,7 +4025,7 @@ pm_encoding_cp949_char_width(const uint8_t *b, ptrdiff_t n) {
}

// These are the double byte characters
if ((n > 1) && (b[0] >= 0x81 && b[0] <= 0xfe) && (b[1] >= 0x41 && b[1] <= 0xfe)) {
if ((n > 1) && (b[0] >= 0x81 && b[0] <= 0xFE) && ((b[1] >= 0x41 && b[1] <= 0x5A) || (b[1] >= 0x61 && b[1] <= 0x7A) || (b[1] >= 0x81 && b[1] <= 0xFE))) {
return 2;
}

Expand Down Expand Up @@ -4096,6 +4099,27 @@ pm_encoding_euc_jp_char_width(const uint8_t *b, ptrdiff_t n) {
return 0;
}

/**
 * Returns true if the next character in the EUC-JP encoding is an uppercase
 * character. Returns false for every other character, and also when no
 * character can be decoded from the given bytes.
 */
static bool
pm_encoding_euc_jp_isupper_char(const uint8_t *b, ptrdiff_t n) {
    switch (pm_encoding_euc_jp_char_width(b, n)) {
        case 1:
            // Single-byte characters are classified by the ASCII check.
            return pm_encoding_ascii_isupper_char(b, n);
        case 2:
            // Only three lead bytes carry an uppercase range in the trailing
            // byte.
            // NOTE(review): these look like the JIS X 0208 rows for fullwidth
            // Latin (0xA3), Greek (0xA6) and Cyrillic (0xA7) capitals --
            // confirm against the code chart.
            switch (b[0]) {
                case 0xA3:
                    return b[1] >= 0xC1 && b[1] <= 0xDA;
                case 0xA6:
                    return b[1] >= 0xA1 && b[1] <= 0xB8;
                case 0xA7:
                    return b[1] >= 0xA1 && b[1] <= 0xC1;
                default:
                    return false;
            }
        default:
            // Any other width (including 0 for undecodable input) is never
            // reported as uppercase.
            return false;
    }
}

/**
* Returns the size of the next character in the EUC-KR encoding, or 0 if a
* character cannot be decoded from the given bytes.
Expand Down Expand Up @@ -4201,18 +4225,59 @@ pm_encoding_gbk_char_width(const uint8_t *b, ptrdiff_t n) {
static size_t
pm_encoding_shift_jis_char_width(const uint8_t *b, ptrdiff_t n) {
// These are the single byte characters.
if (*b < 0x80 || (*b >= 0xA1 && *b <= 0xDF)) {
if (b[0] < 0x80 || (b[0] >= 0xA1 && b[0] <= 0xDF)) {
return 1;
}

// These are the double byte characters.
if ((n > 1) && ((b[0] >= 0x81 && b[0] <= 0x9F) || (b[0] >= 0xE0 && b[0] <= 0xFC)) && (b[1] >= 0x40 && b[1] <= 0xFC)) {
if ((n > 1) && ((b[0] >= 0x81 && b[0] <= 0x9F) || (b[0] >= 0xE0 && b[0] <= 0xFC)) && (b[1] >= 0x40 && b[1] <= 0xFC && b[1] != 0x7F)) {
return 2;
}

return 0;
}

/**
 * Returns the number of bytes making up the next character in the Shift_JIS
 * encoding if that character counts as alphanumeric, or 0 otherwise.
 *
 * Every character that is not exactly one byte wide is passed through with the
 * width reported by pm_encoding_shift_jis_char_width (so an undecodable
 * sequence yields 0). A single-byte character counts as alphanumeric when its
 * byte is 0x80 or above, or when the ASCII alphanumeric check accepts it.
 */
static size_t
pm_encoding_shift_jis_alnum_char(const uint8_t *b, ptrdiff_t n) {
    const size_t char_width = pm_encoding_shift_jis_char_width(b, n);

    // Anything other than a single byte is reported with its decoded width.
    if (char_width != 1) {
        return char_width;
    }

    // Single-byte characters outside the 7-bit range are always accepted.
    if (b[0] >= 0x80) {
        return 1;
    }

    // Otherwise fall back to the ASCII classification, normalized to 0 or 1.
    return pm_encoding_ascii_alnum_char(b, n) ? 1 : 0;
}

/**
 * Returns the number of bytes making up the next character in the Shift_JIS
 * encoding if that character counts as alphabetical, or 0 otherwise.
 *
 * Every character that is not exactly one byte wide is passed through with the
 * width reported by pm_encoding_shift_jis_char_width (so an undecodable
 * sequence yields 0). A single-byte character counts as alphabetical when its
 * byte is 0x80 or above, or when the ASCII alphabetical check accepts it.
 */
static size_t
pm_encoding_shift_jis_alpha_char(const uint8_t *b, ptrdiff_t n) {
    const size_t char_width = pm_encoding_shift_jis_char_width(b, n);

    // Anything other than a single byte is reported with its decoded width.
    if (char_width != 1) {
        return char_width;
    }

    // Single-byte characters outside the 7-bit range are always accepted.
    if (b[0] >= 0x80) {
        return 1;
    }

    // Otherwise fall back to the ASCII classification, normalized to 0 or 1.
    return pm_encoding_ascii_alpha_char(b, n) ? 1 : 0;
}

/**
 * Returns true if the next character in the Shift_JIS encoding is an uppercase
 * character. Returns false for every other character, and also when no
 * character can be decoded from the given bytes.
 */
static bool
pm_encoding_shift_jis_isupper_char(const uint8_t *b, ptrdiff_t n) {
    size_t width = pm_encoding_shift_jis_char_width(b, n);

    if (width == 1) {
        // Single-byte characters are classified by the ASCII check.
        return pm_encoding_ascii_isupper_char(b, n);
    } else if (width == 2) {
        // Only three lead bytes carry an uppercase range in the trailing byte.
        // NOTE(review): these look like the fullwidth Latin (0x82), Greek
        // (0x83) and Cyrillic (0x84) capitals -- confirm against the
        // Shift_JIS code chart.
        return (
            ((b[0] == 0x82) && (b[1] >= 0x60 && b[1] <= 0x79)) ||
            ((b[0] == 0x83) && (b[1] >= 0x9F && b[1] <= 0xB6)) ||
            ((b[0] == 0x84) && (b[1] >= 0x40 && b[1] <= 0x60))
        );
    } else {
        // Undecodable input is never uppercase. Return an explicit boolean
        // rather than the size_t width, matching the EUC-JP counterpart.
        return false;
    }
}

/**
* This is the table of all of the encodings that prism supports.
*/
Expand Down Expand Up @@ -4270,7 +4335,7 @@ const pm_encoding_t pm_encodings[] = {
.char_width = pm_encoding_euc_jp_char_width,
.alnum_char = pm_encoding_ascii_alnum_char_7bit,
.alpha_char = pm_encoding_ascii_alpha_char_7bit,
.isupper_char = pm_encoding_ascii_isupper_char_7bit,
.isupper_char = pm_encoding_euc_jp_isupper_char,
.multibyte = true
},
[PM_ENCODING_CP850] = {
Expand Down Expand Up @@ -4334,23 +4399,23 @@ const pm_encoding_t pm_encodings[] = {
.char_width = pm_encoding_euc_jp_char_width,
.alnum_char = pm_encoding_ascii_alnum_char_7bit,
.alpha_char = pm_encoding_ascii_alpha_char_7bit,
.isupper_char = pm_encoding_ascii_isupper_char_7bit,
.isupper_char = pm_encoding_euc_jp_isupper_char,
.multibyte = true
},
[PM_ENCODING_EUC_JP_MS] = {
.name = "eucJP-ms",
.char_width = pm_encoding_euc_jp_char_width,
.alnum_char = pm_encoding_ascii_alnum_char_7bit,
.alpha_char = pm_encoding_ascii_alpha_char_7bit,
.isupper_char = pm_encoding_ascii_isupper_char_7bit,
.isupper_char = pm_encoding_euc_jp_isupper_char,
.multibyte = true
},
[PM_ENCODING_EUC_JIS_2004] = {
.name = "EUC-JIS-2004",
.char_width = pm_encoding_euc_jp_char_width,
.alnum_char = pm_encoding_ascii_alnum_char_7bit,
.alpha_char = pm_encoding_ascii_alpha_char_7bit,
.isupper_char = pm_encoding_ascii_isupper_char_7bit,
.isupper_char = pm_encoding_euc_jp_isupper_char,
.multibyte = true
},
[PM_ENCODING_EUC_KR] = {
Expand Down Expand Up @@ -4708,9 +4773,9 @@ const pm_encoding_t pm_encodings[] = {
[PM_ENCODING_MAC_JAPANESE] = {
.name = "MacJapanese",
.char_width = pm_encoding_shift_jis_char_width,
.alnum_char = pm_encoding_ascii_alnum_char_7bit,
.alpha_char = pm_encoding_ascii_alpha_char_7bit,
.isupper_char = pm_encoding_ascii_isupper_char_7bit,
.alnum_char = pm_encoding_shift_jis_alnum_char,
.alpha_char = pm_encoding_shift_jis_alpha_char,
.isupper_char = pm_encoding_shift_jis_isupper_char,
.multibyte = true
},
[PM_ENCODING_MAC_ROMAN] = {
Expand Down Expand Up @@ -4756,33 +4821,33 @@ const pm_encoding_t pm_encodings[] = {
[PM_ENCODING_SHIFT_JIS] = {
.name = "Shift_JIS",
.char_width = pm_encoding_shift_jis_char_width,
.alnum_char = pm_encoding_ascii_alnum_char_7bit,
.alpha_char = pm_encoding_ascii_alpha_char_7bit,
.isupper_char = pm_encoding_ascii_isupper_char_7bit,
.alnum_char = pm_encoding_shift_jis_alnum_char,
.alpha_char = pm_encoding_shift_jis_alpha_char,
.isupper_char = pm_encoding_shift_jis_isupper_char,
.multibyte = true
},
[PM_ENCODING_SJIS_DOCOMO] = {
.name = "SJIS-DoCoMo",
.char_width = pm_encoding_shift_jis_char_width,
.alnum_char = pm_encoding_ascii_alnum_char_7bit,
.alpha_char = pm_encoding_ascii_alpha_char_7bit,
.isupper_char = pm_encoding_ascii_isupper_char_7bit,
.alnum_char = pm_encoding_shift_jis_alnum_char,
.alpha_char = pm_encoding_shift_jis_alpha_char,
.isupper_char = pm_encoding_shift_jis_isupper_char,
.multibyte = true
},
[PM_ENCODING_SJIS_KDDI] = {
.name = "SJIS-KDDI",
.char_width = pm_encoding_shift_jis_char_width,
.alnum_char = pm_encoding_ascii_alnum_char_7bit,
.alpha_char = pm_encoding_ascii_alpha_char_7bit,
.isupper_char = pm_encoding_ascii_isupper_char_7bit,
.alnum_char = pm_encoding_shift_jis_alnum_char,
.alpha_char = pm_encoding_shift_jis_alpha_char,
.isupper_char = pm_encoding_shift_jis_isupper_char,
.multibyte = true
},
[PM_ENCODING_SJIS_SOFTBANK] = {
.name = "SJIS-SoftBank",
.char_width = pm_encoding_shift_jis_char_width,
.alnum_char = pm_encoding_ascii_alnum_char_7bit,
.alpha_char = pm_encoding_ascii_alpha_char_7bit,
.isupper_char = pm_encoding_ascii_isupper_char_7bit,
.alnum_char = pm_encoding_shift_jis_alnum_char,
.alpha_char = pm_encoding_shift_jis_alpha_char,
.isupper_char = pm_encoding_shift_jis_isupper_char,
.multibyte = true
},
[PM_ENCODING_STATELESS_ISO_2022_JP] = {
Expand Down Expand Up @@ -4924,9 +4989,9 @@ const pm_encoding_t pm_encodings[] = {
[PM_ENCODING_WINDOWS_31J] = {
.name = "Windows-31J",
.char_width = pm_encoding_shift_jis_char_width,
.alnum_char = pm_encoding_ascii_alnum_char_7bit,
.alpha_char = pm_encoding_ascii_alpha_char_7bit,
.isupper_char = pm_encoding_ascii_isupper_char_7bit,
.alnum_char = pm_encoding_shift_jis_alnum_char,
.alpha_char = pm_encoding_shift_jis_alpha_char,
.isupper_char = pm_encoding_shift_jis_isupper_char,
.multibyte = true
},
[PM_ENCODING_WINDOWS_874] = {
Expand Down

0 comments on commit 34bad6d

Please sign in to comment.