@@ -6022,10 +6022,18 @@ parser_lex_magic_comment_encoding_value(pm_parser_t *parser, const uint8_t *star
6022
6022
}
6023
6023
6024
6024
// Next, we're going to check for UTF-8. This is the most common encoding.
6025
- // Extensions like utf-8 can contain extra encoding details like,
6026
- // utf-8-dos, utf-8-linux, utf-8-mac. We treat these all as utf-8 should
6027
- // treat any encoding starting utf-8 as utf-8.
6025
+ // utf-8 can contain extra information at the end about the platform it is
6026
+ // encoded on, such as utf-8-mac or utf-8-unix. We'll ignore those suffixes.
6028
6027
if ((start + 5 <= end) && (pm_strncasecmp(start, (const uint8_t *) "utf-8", 5) == 0)) {
6028
+ // We need to explicitly handle utf-8-hfs, as that one needs to switch
6029
+ // over to being utf8-mac.
6030
+ if (width == 9 && (pm_strncasecmp(start + 5, (const uint8_t *) "-hfs", 4) == 0)) {
6031
+ parser->encoding = pm_encoding_utf8_mac;
6032
+ parser->encoding_changed = true;
6033
+ if (parser->encoding_changed_callback != NULL) parser->encoding_changed_callback(parser);
6034
+ return true;
6035
+ }
6036
+
6029
6037
// We don't need to do anything here because the default encoding is
6030
6038
// already UTF-8. We'll just return.
6031
6039
return true;
@@ -6036,48 +6044,58 @@ parser_lex_magic_comment_encoding_value(pm_parser_t *parser, const uint8_t *star
6036
6044
#define ENCODING(value, prebuilt) \
6037
6045
if (width == sizeof(value) - 1 && start + width <= end && pm_strncasecmp(start, (const uint8_t *) value, width) == 0) { \
6038
6046
parser->encoding = prebuilt; \
6039
- parser->encoding_changed | = true; \
6047
+ parser->encoding_changed = true; \
6040
6048
if (parser->encoding_changed_callback != NULL) parser->encoding_changed_callback(parser); \
6041
6049
return true; \
6042
6050
}
6043
6051
6052
+ // Build convenience macros to compare aliases for the same encoding.
6053
+ #define ENCODING2(value1, value2, prebuilt) ENCODING(value1, prebuilt) ENCODING(value2, prebuilt)
6054
+ #define ENCODING3(value1, value2, value3, prebuilt) ENCODING2(value1, value2, prebuilt) ENCODING(value3, prebuilt)
6055
+ #define ENCODING4(value1, value2, value3, value4, prebuilt) ENCODING3(value1, value2, value3, prebuilt) ENCODING(value4, prebuilt)
6056
+ #define ENCODING5(value1, value2, value3, value4, value5, prebuilt) ENCODING4(value1, value2, value3, value4, prebuilt) ENCODING(value5, prebuilt)
6057
+
6044
6058
// Check most common first. (This is pretty arbitrary.)
6045
- ENCODING("ascii ", pm_encoding_ascii);
6046
- ENCODING("ascii-8bit ", pm_encoding_ascii_8bit);
6047
- ENCODING("us-ascii ", pm_encoding_ascii);
6048
- ENCODING("binary ", pm_encoding_ascii_8bit);
6049
- ENCODING("shift_jis ", pm_encoding_shift_jis);
6050
- ENCODING("euc-jp ", pm_encoding_euc_jp);
6059
+ ENCODING("ASCII ", pm_encoding_ascii);
6060
+ ENCODING("ASCII-8BIT ", pm_encoding_ascii_8bit);
6061
+ ENCODING("US-ASCII ", pm_encoding_ascii);
6062
+ ENCODING("BINARY ", pm_encoding_ascii_8bit);
6063
+ ENCODING("Shift_JIS ", pm_encoding_shift_jis);
6064
+ ENCODING("EUC-JP ", pm_encoding_euc_jp);
6051
6065
6052
6066
// Then check all the others.
6053
- ENCODING("big5 ", pm_encoding_big5 );
6067
+ ENCODING2("ANSI_X3.4-1968 ", "646", pm_encoding_ascii );
6054
6068
ENCODING("cp51932", pm_encoding_cp51932);
6055
- ENCODING("gbk", pm_encoding_gbk);
6056
- ENCODING("iso-8859-1", pm_encoding_iso_8859_1);
6057
- ENCODING("iso-8859-2", pm_encoding_iso_8859_2);
6058
- ENCODING("iso-8859-3", pm_encoding_iso_8859_3);
6059
- ENCODING("iso-8859-4", pm_encoding_iso_8859_4);
6060
- ENCODING("iso-8859-5", pm_encoding_iso_8859_5);
6061
- ENCODING("iso-8859-6", pm_encoding_iso_8859_6);
6062
- ENCODING("iso-8859-7", pm_encoding_iso_8859_7);
6063
- ENCODING("iso-8859-8", pm_encoding_iso_8859_8);
6064
- ENCODING("iso-8859-9", pm_encoding_iso_8859_9);
6065
- ENCODING("iso-8859-10", pm_encoding_iso_8859_10);
6066
- ENCODING("iso-8859-11", pm_encoding_iso_8859_11);
6067
- ENCODING("iso-8859-13", pm_encoding_iso_8859_13);
6068
- ENCODING("iso-8859-14", pm_encoding_iso_8859_14);
6069
- ENCODING("iso-8859-15", pm_encoding_iso_8859_15);
6070
- ENCODING("iso-8859-16", pm_encoding_iso_8859_16);
6071
- ENCODING("koi8-r", pm_encoding_koi8_r);
6072
- ENCODING("windows-31j", pm_encoding_windows_31j);
6073
- ENCODING("windows-1251", pm_encoding_windows_1251);
6074
- ENCODING("windows-1252", pm_encoding_windows_1252);
6075
- ENCODING("cp1251", pm_encoding_windows_1251);
6076
- ENCODING("cp1252", pm_encoding_windows_1252);
6077
- ENCODING("cp932", pm_encoding_windows_31j);
6078
- ENCODING("sjis", pm_encoding_windows_31j);
6079
- ENCODING("utf8-mac", pm_encoding_utf8_mac);
6080
-
6069
+ ENCODING("eucJP", pm_encoding_euc_jp);
6070
+ ENCODING("Big5", pm_encoding_big5);
6071
+ ENCODING2("GBK", "CP936", pm_encoding_gbk);
6072
+ ENCODING2("ISO-8859-1", "ISO8859-1", pm_encoding_iso_8859_1);
6073
+ ENCODING2("ISO-8859-2", "ISO8859-2", pm_encoding_iso_8859_2);
6074
+ ENCODING2("ISO-8859-3", "ISO8859-3", pm_encoding_iso_8859_3);
6075
+ ENCODING2("ISO-8859-4", "ISO8859-4", pm_encoding_iso_8859_4);
6076
+ ENCODING2("ISO-8859-5", "ISO8859-5", pm_encoding_iso_8859_5);
6077
+ ENCODING2("ISO-8859-6", "ISO8859-6", pm_encoding_iso_8859_6);
6078
+ ENCODING2("ISO-8859-7", "ISO8859-7", pm_encoding_iso_8859_7);
6079
+ ENCODING2("ISO-8859-8", "ISO8859-8", pm_encoding_iso_8859_8);
6080
+ ENCODING2("ISO-8859-9", "ISO8859-9", pm_encoding_iso_8859_9);
6081
+ ENCODING2("ISO-8859-10", "ISO8859-10", pm_encoding_iso_8859_10);
6082
+ ENCODING2("ISO-8859-11", "ISO8859-11", pm_encoding_iso_8859_11);
6083
+ ENCODING2("ISO-8859-13", "ISO8859-13", pm_encoding_iso_8859_13);
6084
+ ENCODING2("ISO-8859-14", "ISO8859-14", pm_encoding_iso_8859_14);
6085
+ ENCODING2("ISO-8859-15", "ISO8859-15", pm_encoding_iso_8859_15);
6086
+ ENCODING2("ISO-8859-16", "ISO8859-16", pm_encoding_iso_8859_16);
6087
+ ENCODING2("KOI8-R", "CP878", pm_encoding_koi8_r);
6088
+ ENCODING4("CP65001", "locale", "external", "filesystem", pm_encoding_utf_8);
6089
+ ENCODING3("UTF8-MAC", "UTF-8-MAC", "UTF-8-HFS", pm_encoding_utf8_mac);
6090
+ ENCODING2("Windows-1250", "CP1250", pm_encoding_windows_1250);
6091
+ ENCODING2("Windows-1251", "CP1251", pm_encoding_windows_1251);
6092
+ ENCODING2("Windows-1252", "CP1252", pm_encoding_windows_1252);
6093
+ ENCODING5("Windows-31J", "CP932", "csWindows31J", "SJIS", "PCK", pm_encoding_windows_31j);
6094
+
6095
+ #undef ENCODING2
6096
+ #undef ENCODING3
6097
+ #undef ENCODING4
6098
+ #undef ENCODING5
6081
6099
#undef ENCODING
6082
6100
6083
6101
return false;
0 commit comments