From af7bf9e0d8fd5e542781ca7ac9550e3011913924 Mon Sep 17 00:00:00 2001 From: Kevin Newton Date: Wed, 20 Mar 2024 10:08:13 -0400 Subject: [PATCH] [ruby/prism] Provide options for reducing size https://github.com/ruby/prism/commit/592128de4d --- prism/encoding.c | 318 ++++++++++++++------------ prism/encoding.h | 12 +- prism/extension.c | 8 +- prism/pack.c | 50 ++-- prism/pack.h | 11 + prism/prettyprint.h | 8 + prism/prism.c | 44 +++- prism/prism.h | 13 ++ prism/templates/src/node.c.erb | 6 + prism/templates/src/prettyprint.c.erb | 11 + prism/templates/src/serialize.c.erb | 26 +-- test/prism/encoding_test.rb | 69 +++--- test/prism/fuzzer_test.rb | 5 +- test/prism/magic_comment_test.rb | 6 +- test/prism/parse_test.rb | 34 +-- test/prism/ruby_api_test.rb | 24 +- test/prism/static_inspect_test.rb | 2 +- 17 files changed, 393 insertions(+), 254 deletions(-) diff --git a/prism/encoding.c b/prism/encoding.c index dc63cccc2db2a4..a4aeed104f89b9 100644 --- a/prism/encoding.c +++ b/prism/encoding.c @@ -2358,6 +2358,8 @@ pm_encoding_utf_8_isupper_char(const uint8_t *b, ptrdiff_t n) { } } +#ifndef PRISM_ENCODING_EXCLUDE_FULL + static pm_unicode_codepoint_t pm_cesu_8_codepoint(const uint8_t *b, ptrdiff_t n, size_t *width) { if (b[0] < 0x80) { @@ -2452,6 +2454,8 @@ pm_encoding_cesu_8_isupper_char(const uint8_t *b, ptrdiff_t n) { } } +#endif + #undef UNICODE_ALPHA_CODEPOINTS_LENGTH #undef UNICODE_ALNUM_CODEPOINTS_LENGTH #undef UNICODE_ISUPPER_CODEPOINTS_LENGTH @@ -2480,6 +2484,8 @@ static const uint8_t pm_encoding_ascii_table[256] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Fx }; +#ifndef PRISM_ENCODING_EXCLUDE_FULL + /** * Each element of the following table contains a bitfield that indicates a * piece of information about the corresponding CP850 character. @@ -3918,6 +3924,7 @@ PRISM_ENCODING_TABLE(windows_1258) PRISM_ENCODING_TABLE(windows_874) #undef PRISM_ENCODING_TABLE +#endif /** * Returns the size of the next character in the ASCII encoding. This basically @@ -3976,22 +3983,129 @@ pm_encoding_ascii_isupper_char(const uint8_t *b, PRISM_ATTRIBUTE_UNUSED ptrdiff_ } /** - * Certain encodings are equivalent to ASCII below 0x80, so it works for our - * purposes to have a function here that first checks the bounds and then falls - * back to checking the ASCII lookup table. + * For a lot of encodings the default is that they are a single byte long no + * matter what the codepoint, so this function is shared between them. + */ +static size_t +pm_encoding_single_char_width(PRISM_ATTRIBUTE_UNUSED const uint8_t *b, PRISM_ATTRIBUTE_UNUSED ptrdiff_t n) { + return 1; +} + +/** + * Returns the size of the next character in the EUC-JP encoding, or 0 if a + * character cannot be decoded from the given bytes. + */ +static size_t +pm_encoding_euc_jp_char_width(const uint8_t *b, ptrdiff_t n) { + // These are the single byte characters. + if (*b < 0x80) { + return 1; + } + + // These are the double byte characters. + if ((n > 1) && ((b[0] == 0x8E) || (b[0] >= 0xA1 && b[0] <= 0xFE)) && (b[1] >= 0xA1 && b[1] <= 0xFE)) { + return 2; + } + + // These are the triple byte characters. + if ((n > 2) && (b[0] == 0x8F) && (b[1] >= 0xA1 && b[2] <= 0xFE) && (b[2] >= 0xA1 && b[2] <= 0xFE)) { + return 3; + } + + return 0; +} + +/** + * Returns the size of the next character in the EUC-JP encoding if it is an + * uppercase character. */ static bool -pm_encoding_ascii_isupper_char_7bit(const uint8_t *b, ptrdiff_t n) { - return (*b < 0x80) && pm_encoding_ascii_isupper_char(b, n); +pm_encoding_euc_jp_isupper_char(const uint8_t *b, ptrdiff_t n) { + size_t width = pm_encoding_euc_jp_char_width(b, n); + + if (width == 1) { + return pm_encoding_ascii_isupper_char(b, n); + } else if (width == 2) { + return ( + (b[0] == 0xA3 && b[1] >= 0xC1 && b[1] <= 0xDA) || + (b[0] == 0xA6 && b[1] >= 0xA1 && b[1] <= 0xB8) || + (b[0] == 0xA7 && b[1] >= 0xA1 && b[1] <= 0xC1) + ); + } else { + return false; + } } /** - * For a lot of encodings the default is that they are a single byte long no - * matter what the codepoint, so this function is shared between them. + * Returns the size of the next character in the Shift_JIS encoding, or 0 if a + * character cannot be decoded from the given bytes. */ static size_t -pm_encoding_single_char_width(PRISM_ATTRIBUTE_UNUSED const uint8_t *b, PRISM_ATTRIBUTE_UNUSED ptrdiff_t n) { - return 1; +pm_encoding_shift_jis_char_width(const uint8_t *b, ptrdiff_t n) { + // These are the single byte characters. + if (b[0] < 0x80 || (b[0] >= 0xA1 && b[0] <= 0xDF)) { + return 1; + } + + // These are the double byte characters. + if ((n > 1) && ((b[0] >= 0x81 && b[0] <= 0x9F) || (b[0] >= 0xE0 && b[0] <= 0xFC)) && (b[1] >= 0x40 && b[1] <= 0xFC && b[1] != 0x7F)) { + return 2; + } + + return 0; +} + +/** + * Returns the size of the next character in the Shift_JIS encoding if it is an + * alphanumeric character. + */ +static size_t +pm_encoding_shift_jis_alnum_char(const uint8_t *b, ptrdiff_t n) { + size_t width = pm_encoding_shift_jis_char_width(b, n); + return width == 1 ? ((b[0] >= 0x80) || pm_encoding_ascii_alnum_char(b, n)) : width; +} + +/** + * Returns the size of the next character in the Shift_JIS encoding if it is an + * alphabetical character. + */ +static size_t +pm_encoding_shift_jis_alpha_char(const uint8_t *b, ptrdiff_t n) { + size_t width = pm_encoding_shift_jis_char_width(b, n); + return width == 1 ? ((b[0] >= 0x80) || pm_encoding_ascii_alpha_char(b, n)) : width; +} + +/** + * Returns the size of the next character in the Shift_JIS encoding if it is an + * uppercase character. + */ +static bool +pm_encoding_shift_jis_isupper_char(const uint8_t *b, ptrdiff_t n) { + size_t width = pm_encoding_shift_jis_char_width(b, n); + + if (width == 1) { + return pm_encoding_ascii_isupper_char(b, n); + } else if (width == 2) { + return ( + ((b[0] == 0x82) && (b[1] >= 0x60 && b[1] <= 0x79)) || + ((b[0] == 0x83) && (b[1] >= 0x9F && b[1] <= 0xB6)) || + ((b[0] == 0x84) && (b[1] >= 0x40 && b[1] <= 0x60)) + ); + } else { + return width; + } +} + +#ifndef PRISM_ENCODING_EXCLUDE_FULL + +/** + * Certain encodings are equivalent to ASCII below 0x80, so it works for our + * purposes to have a function here that first checks the bounds and then falls + * back to checking the ASCII lookup table. + */ +static bool +pm_encoding_ascii_isupper_char_7bit(const uint8_t *b, ptrdiff_t n) { + return (*b < 0x80) && pm_encoding_ascii_isupper_char(b, n); } /** @@ -4075,51 +4189,6 @@ pm_encoding_emacs_mule_char_width(const uint8_t *b, ptrdiff_t n) { return 0; } -/** - * Returns the size of the next character in the EUC-JP encoding, or 0 if a - * character cannot be decoded from the given bytes. - */ -static size_t -pm_encoding_euc_jp_char_width(const uint8_t *b, ptrdiff_t n) { - // These are the single byte characters. - if (*b < 0x80) { - return 1; - } - - // These are the double byte characters. - if ((n > 1) && ((b[0] == 0x8E) || (b[0] >= 0xA1 && b[0] <= 0xFE)) && (b[1] >= 0xA1 && b[1] <= 0xFE)) { - return 2; - } - - // These are the triple byte characters. - if ((n > 2) && (b[0] == 0x8F) && (b[1] >= 0xA1 && b[2] <= 0xFE) && (b[2] >= 0xA1 && b[2] <= 0xFE)) { - return 3; - } - - return 0; -} - -/** - * Returns the size of the next character in the EUC-JP encoding if it is an - * uppercase character. - */ -static bool -pm_encoding_euc_jp_isupper_char(const uint8_t *b, ptrdiff_t n) { - size_t width = pm_encoding_euc_jp_char_width(b, n); - - if (width == 1) { - return pm_encoding_ascii_isupper_char(b, n); - } else if (width == 2) { - return ( - (b[0] == 0xA3 && b[1] >= 0xC1 && b[1] <= 0xDA) || - (b[0] == 0xA6 && b[1] >= 0xA1 && b[1] <= 0xB8) || - (b[0] == 0xA7 && b[1] >= 0xA1 && b[1] <= 0xC1) - ); - } else { - return false; - } -} - /** * Returns the size of the next character in the EUC-KR encoding, or 0 if a * character cannot be decoded from the given bytes. @@ -4218,65 +4287,7 @@ pm_encoding_gbk_char_width(const uint8_t *b, ptrdiff_t n) { return 0; } -/** - * Returns the size of the next character in the Shift_JIS encoding, or 0 if a - * character cannot be decoded from the given bytes. - */ -static size_t -pm_encoding_shift_jis_char_width(const uint8_t *b, ptrdiff_t n) { - // These are the single byte characters. - if (b[0] < 0x80 || (b[0] >= 0xA1 && b[0] <= 0xDF)) { - return 1; - } - - // These are the double byte characters. - if ((n > 1) && ((b[0] >= 0x81 && b[0] <= 0x9F) || (b[0] >= 0xE0 && b[0] <= 0xFC)) && (b[1] >= 0x40 && b[1] <= 0xFC && b[1] != 0x7F)) { - return 2; - } - - return 0; -} - -/** - * Returns the size of the next character in the Shift_JIS encoding if it is an - * alphanumeric character. - */ -static size_t -pm_encoding_shift_jis_alnum_char(const uint8_t *b, ptrdiff_t n) { - size_t width = pm_encoding_shift_jis_char_width(b, n); - return width == 1 ? ((b[0] >= 0x80) || pm_encoding_ascii_alnum_char(b, n)) : width; -} - -/** - * Returns the size of the next character in the Shift_JIS encoding if it is an - * alphabetical character. - */ -static size_t -pm_encoding_shift_jis_alpha_char(const uint8_t *b, ptrdiff_t n) { - size_t width = pm_encoding_shift_jis_char_width(b, n); - return width == 1 ? ((b[0] >= 0x80) || pm_encoding_ascii_alpha_char(b, n)) : width; -} - -/** - * Returns the size of the next character in the Shift_JIS encoding if it is an - * uppercase character. - */ -static bool -pm_encoding_shift_jis_isupper_char(const uint8_t *b, ptrdiff_t n) { - size_t width = pm_encoding_shift_jis_char_width(b, n); - - if (width == 1) { - return pm_encoding_ascii_isupper_char(b, n); - } else if (width == 2) { - return ( - ((b[0] == 0x82) && (b[1] >= 0x60 && b[1] <= 0x79)) || - ((b[0] == 0x83) && (b[1] >= 0x9F && b[1] <= 0xB6)) || - ((b[0] == 0x84) && (b[1] >= 0x40 && b[1] <= 0x60)) - ); - } else { - return width; - } -} +#endif /** * This is the table of all of the encodings that prism supports. @@ -4290,6 +4301,14 @@ const pm_encoding_t pm_encodings[] = { .isupper_char = pm_encoding_utf_8_isupper_char, .multibyte = true }, + [PM_ENCODING_US_ASCII] = { + .name = "US-ASCII", + .char_width = pm_encoding_ascii_char_width, + .alnum_char = pm_encoding_ascii_alnum_char, + .alpha_char = pm_encoding_ascii_alpha_char, + .isupper_char = pm_encoding_ascii_isupper_char, + .multibyte = false + }, [PM_ENCODING_ASCII_8BIT] = { .name = "ASCII-8BIT", .char_width = pm_encoding_single_char_width, @@ -4298,6 +4317,24 @@ const pm_encoding_t pm_encodings[] = { .isupper_char = pm_encoding_ascii_isupper_char, .multibyte = false }, + [PM_ENCODING_EUC_JP] = { + .name = "EUC-JP", + .char_width = pm_encoding_euc_jp_char_width, + .alnum_char = pm_encoding_ascii_alnum_char_7bit, + .alpha_char = pm_encoding_ascii_alpha_char_7bit, + .isupper_char = pm_encoding_euc_jp_isupper_char, + .multibyte = true + }, + [PM_ENCODING_WINDOWS_31J] = { + .name = "Windows-31J", + .char_width = pm_encoding_shift_jis_char_width, + .alnum_char = pm_encoding_shift_jis_alnum_char, + .alpha_char = pm_encoding_shift_jis_alpha_char, + .isupper_char = pm_encoding_shift_jis_isupper_char, + .multibyte = true + }, + +#ifndef PRISM_ENCODING_EXCLUDE_FULL [PM_ENCODING_BIG5] = { .name = "Big5", .char_width = pm_encoding_big5_char_width, @@ -4394,14 +4431,6 @@ const pm_encoding_t pm_encodings[] = { .isupper_char = pm_encoding_ascii_isupper_char_7bit, .multibyte = true }, - [PM_ENCODING_EUC_JP] = { - .name = "EUC-JP", - .char_width = pm_encoding_euc_jp_char_width, - .alnum_char = pm_encoding_ascii_alnum_char_7bit, - .alpha_char = pm_encoding_ascii_alpha_char_7bit, - .isupper_char = pm_encoding_euc_jp_isupper_char, - .multibyte = true - }, [PM_ENCODING_EUC_JP_MS] = { .name = "eucJP-ms", .char_width = pm_encoding_euc_jp_char_width, @@ -4874,14 +4903,6 @@ const pm_encoding_t pm_encodings[] = { .isupper_char = pm_encoding_tis_620_isupper_char, .multibyte = false }, - [PM_ENCODING_US_ASCII] = { - .name = "US-ASCII", - .char_width = pm_encoding_ascii_char_width, - .alnum_char = pm_encoding_ascii_alnum_char, - .alpha_char = pm_encoding_ascii_alpha_char, - .isupper_char = pm_encoding_ascii_isupper_char, - .multibyte = false - }, [PM_ENCODING_UTF8_MAC] = { .name = "UTF8-MAC", .char_width = pm_encoding_utf_8_char_width, @@ -4986,14 +5007,6 @@ const pm_encoding_t pm_encodings[] = { .isupper_char = pm_encoding_windows_1258_isupper_char, .multibyte = false }, - [PM_ENCODING_WINDOWS_31J] = { - .name = "Windows-31J", - .char_width = pm_encoding_shift_jis_char_width, - .alnum_char = pm_encoding_shift_jis_alnum_char, - .alpha_char = pm_encoding_shift_jis_alpha_char, - .isupper_char = pm_encoding_shift_jis_isupper_char, - .multibyte = true - }, [PM_ENCODING_WINDOWS_874] = { .name = "Windows-874", .char_width = pm_encoding_single_char_width, @@ -5002,6 +5015,7 @@ const pm_encoding_t pm_encodings[] = { .isupper_char = pm_encoding_windows_874_isupper_char, .multibyte = false } +#endif }; /** @@ -5016,11 +5030,13 @@ pm_encoding_find(const uint8_t *start, const uint8_t *end) { // UTF-8 can contain extra information at the end about the platform it is // encoded on, such as UTF-8-MAC or UTF-8-UNIX. We'll ignore those suffixes. if ((start + 5 <= end) && (pm_strncasecmp(start, (const uint8_t *) "UTF-8", 5) == 0)) { +#ifndef PRISM_ENCODING_EXCLUDE_FULL // We need to explicitly handle UTF-8-HFS, as that one needs to switch // over to being UTF8-MAC. if (width == 9 && (pm_strncasecmp(start + 5, (const uint8_t *) "-HFS", 4) == 0)) { return &pm_encodings[PM_ENCODING_UTF8_MAC]; } +#endif // Otherwise we'll return the default UTF-8 encoding. return PM_ENCODING_UTF_8_ENTRY; @@ -5040,11 +5056,16 @@ pm_encoding_find(const uint8_t *start, const uint8_t *end) { break; case 'B': case 'b': ENCODING1("BINARY", PM_ENCODING_ASCII_8BIT); +#ifndef PRISM_ENCODING_EXCLUDE_FULL ENCODING1("Big5", PM_ENCODING_BIG5); ENCODING2("Big5-HKSCS", "Big5-HKSCS:2008", PM_ENCODING_BIG5_HKSCS); ENCODING1("Big5-UAO", PM_ENCODING_BIG5_UAO); +#endif break; case 'C': case 'c': + ENCODING1("CP65001", PM_ENCODING_UTF_8); + ENCODING2("CP932", "csWindows31J", PM_ENCODING_WINDOWS_31J); +#ifndef PRISM_ENCODING_EXCLUDE_FULL ENCODING1("CESU-8", PM_ENCODING_CESU_8); ENCODING1("CP437", PM_ENCODING_IBM437); ENCODING1("CP720", PM_ENCODING_IBM720); @@ -5064,7 +5085,6 @@ pm_encoding_find(const uint8_t *start, const uint8_t *end) { ENCODING1("CP874", PM_ENCODING_WINDOWS_874); ENCODING1("CP878", PM_ENCODING_KOI8_R); ENCODING1("CP863", PM_ENCODING_IBM863); - ENCODING2("CP932", "csWindows31J", PM_ENCODING_WINDOWS_31J); ENCODING1("CP936", PM_ENCODING_GBK); ENCODING1("CP949", PM_ENCODING_CP949); ENCODING1("CP950", PM_ENCODING_CP950); @@ -5079,25 +5099,30 @@ pm_encoding_find(const uint8_t *start, const uint8_t *end) { ENCODING1("CP1257", PM_ENCODING_WINDOWS_1257); ENCODING1("CP1258", PM_ENCODING_WINDOWS_1258); ENCODING1("CP51932", PM_ENCODING_CP51932); - ENCODING1("CP65001", PM_ENCODING_UTF_8); +#endif break; case 'E': case 'e': ENCODING2("EUC-JP", "eucJP", PM_ENCODING_EUC_JP); +#ifndef PRISM_ENCODING_EXCLUDE_FULL ENCODING2("eucJP-ms", "euc-jp-ms", PM_ENCODING_EUC_JP_MS); ENCODING2("EUC-JIS-2004", "EUC-JISX0213", PM_ENCODING_EUC_JIS_2004); ENCODING2("EUC-KR", "eucKR", PM_ENCODING_EUC_KR); ENCODING2("EUC-CN", "eucCN", PM_ENCODING_GB2312); ENCODING2("EUC-TW", "eucTW", PM_ENCODING_EUC_TW); ENCODING1("Emacs-Mule", PM_ENCODING_EMACS_MULE); +#endif break; case 'G': case 'g': +#ifndef PRISM_ENCODING_EXCLUDE_FULL ENCODING1("GBK", PM_ENCODING_GBK); ENCODING1("GB12345", PM_ENCODING_GB12345); ENCODING1("GB18030", PM_ENCODING_GB18030); ENCODING1("GB1988", PM_ENCODING_GB1988); ENCODING1("GB2312", PM_ENCODING_GB2312); +#endif break; case 'I': case 'i': +#ifndef PRISM_ENCODING_EXCLUDE_FULL ENCODING1("IBM437", PM_ENCODING_IBM437); ENCODING1("IBM720", PM_ENCODING_IBM720); ENCODING1("IBM737", PM_ENCODING_IBM737); @@ -5129,12 +5154,16 @@ pm_encoding_find(const uint8_t *start, const uint8_t *end) { ENCODING2("ISO-8859-14", "ISO8859-14", PM_ENCODING_ISO_8859_14); ENCODING2("ISO-8859-15", "ISO8859-15", PM_ENCODING_ISO_8859_15); ENCODING2("ISO-8859-16", "ISO8859-16", PM_ENCODING_ISO_8859_16); +#endif break; case 'K': case 'k': +#ifndef PRISM_ENCODING_EXCLUDE_FULL ENCODING1("KOI8-R", PM_ENCODING_KOI8_R); ENCODING1("KOI8-U", PM_ENCODING_KOI8_U); +#endif break; case 'M': case 'm': +#ifndef PRISM_ENCODING_EXCLUDE_FULL ENCODING1("macCentEuro", PM_ENCODING_MAC_CENT_EURO); ENCODING1("macCroatian", PM_ENCODING_MAC_CROATIAN); ENCODING1("macCyrillic", PM_ENCODING_MAC_CYRILLIC); @@ -5147,31 +5176,39 @@ pm_encoding_find(const uint8_t *start, const uint8_t *end) { ENCODING1("macThai", PM_ENCODING_MAC_THAI); ENCODING1("macTurkish", PM_ENCODING_MAC_TURKISH); ENCODING1("macUkraine", PM_ENCODING_MAC_UKRAINE); +#endif break; case 'P': case 'p': ENCODING1("PCK", PM_ENCODING_WINDOWS_31J); break; case 'S': case 's': - ENCODING1("Shift_JIS", PM_ENCODING_SHIFT_JIS); ENCODING1("SJIS", PM_ENCODING_WINDOWS_31J); +#ifndef PRISM_ENCODING_EXCLUDE_FULL + ENCODING1("Shift_JIS", PM_ENCODING_SHIFT_JIS); ENCODING1("SJIS-DoCoMo", PM_ENCODING_SJIS_DOCOMO); ENCODING1("SJIS-KDDI", PM_ENCODING_SJIS_KDDI); ENCODING1("SJIS-SoftBank", PM_ENCODING_SJIS_SOFTBANK); ENCODING1("stateless-ISO-2022-JP", PM_ENCODING_STATELESS_ISO_2022_JP); ENCODING1("stateless-ISO-2022-JP-KDDI", PM_ENCODING_STATELESS_ISO_2022_JP_KDDI); +#endif break; case 'T': case 't': +#ifndef PRISM_ENCODING_EXCLUDE_FULL ENCODING1("TIS-620", PM_ENCODING_TIS_620); +#endif break; case 'U': case 'u': ENCODING1("US-ASCII", PM_ENCODING_US_ASCII); +#ifndef PRISM_ENCODING_EXCLUDE_FULL ENCODING2("UTF8-MAC", "UTF-8-HFS", PM_ENCODING_UTF8_MAC); ENCODING1("UTF8-DoCoMo", PM_ENCODING_UTF8_DOCOMO); ENCODING1("UTF8-KDDI", PM_ENCODING_UTF8_KDDI); ENCODING1("UTF8-SoftBank", PM_ENCODING_UTF8_SOFTBANK); +#endif break; case 'W': case 'w': ENCODING1("Windows-31J", PM_ENCODING_WINDOWS_31J); +#ifndef PRISM_ENCODING_EXCLUDE_FULL ENCODING1("Windows-874", PM_ENCODING_WINDOWS_874); ENCODING1("Windows-1250", PM_ENCODING_WINDOWS_1250); ENCODING1("Windows-1251", PM_ENCODING_WINDOWS_1251); @@ -5182,6 +5219,7 @@ pm_encoding_find(const uint8_t *start, const uint8_t *end) { ENCODING1("Windows-1256", PM_ENCODING_WINDOWS_1256); ENCODING1("Windows-1257", PM_ENCODING_WINDOWS_1257); ENCODING1("Windows-1258", PM_ENCODING_WINDOWS_1258); +#endif break; case '6': ENCODING1("646", PM_ENCODING_US_ASCII); diff --git a/prism/encoding.h b/prism/encoding.h index 0850e291d81f7e..5f7724821f5b31 100644 --- a/prism/encoding.h +++ b/prism/encoding.h @@ -135,7 +135,14 @@ extern const uint8_t pm_encoding_unicode_table[256]; */ typedef enum { PM_ENCODING_UTF_8 = 0, + PM_ENCODING_US_ASCII, PM_ENCODING_ASCII_8BIT, + PM_ENCODING_EUC_JP, + PM_ENCODING_WINDOWS_31J, + +// We optionally support excluding the full set of encodings to only support the +// minimum necessary to process Ruby code without encoding comments. +#ifndef PRISM_ENCODING_EXCLUDE_FULL PM_ENCODING_BIG5, PM_ENCODING_BIG5_HKSCS, PM_ENCODING_BIG5_UAO, @@ -148,7 +155,6 @@ typedef enum { PM_ENCODING_CP950, PM_ENCODING_CP951, PM_ENCODING_EMACS_MULE, - PM_ENCODING_EUC_JP, PM_ENCODING_EUC_JP_MS, PM_ENCODING_EUC_JIS_2004, PM_ENCODING_EUC_KR, @@ -208,7 +214,6 @@ typedef enum { PM_ENCODING_STATELESS_ISO_2022_JP, PM_ENCODING_STATELESS_ISO_2022_JP_KDDI, PM_ENCODING_TIS_620, - PM_ENCODING_US_ASCII, PM_ENCODING_UTF8_MAC, PM_ENCODING_UTF8_DOCOMO, PM_ENCODING_UTF8_KDDI, @@ -222,8 +227,9 @@ typedef enum { PM_ENCODING_WINDOWS_1256, PM_ENCODING_WINDOWS_1257, PM_ENCODING_WINDOWS_1258, - PM_ENCODING_WINDOWS_31J, PM_ENCODING_WINDOWS_874, +#endif + PM_ENCODING_MAXIMUM } pm_encoding_type_t; diff --git a/prism/extension.c b/prism/extension.c index 3672ee0dc58d36..7c8636e3dfc15c 100644 --- a/prism/extension.c +++ b/prism/extension.c @@ -311,7 +311,7 @@ dump(int argc, VALUE *argv, VALUE self) { pm_options_t options = { 0 }; string_options(argc, argv, &input, &options); -#ifdef PRISM_DEBUG_MODE_BUILD +#ifdef PRISM_BUILD_DEBUG size_t length = pm_string_length(&input); char* dup = xmalloc(length); memcpy(dup, pm_string_source(&input), length); @@ -320,7 +320,7 @@ dump(int argc, VALUE *argv, VALUE self) { VALUE value = dump_input(&input, &options); -#ifdef PRISM_DEBUG_MODE_BUILD +#ifdef PRISM_BUILD_DEBUG xfree(dup); #endif @@ -737,7 +737,7 @@ parse(int argc, VALUE *argv, VALUE self) { pm_options_t options = { 0 }; string_options(argc, argv, &input, &options); -#ifdef PRISM_DEBUG_MODE_BUILD +#ifdef PRISM_BUILD_DEBUG size_t length = pm_string_length(&input); char* dup = xmalloc(length); memcpy(dup, pm_string_source(&input), length); @@ -746,7 +746,7 @@ parse(int argc, VALUE *argv, VALUE self) { VALUE value = parse_input(&input, &options); -#ifdef PRISM_DEBUG_MODE_BUILD +#ifdef PRISM_BUILD_DEBUG xfree(dup); #endif diff --git a/prism/pack.c b/prism/pack.c index d5bfc4d6fdf97d..1388ca8a3b5210 100644 --- a/prism/pack.c +++ b/prism/pack.c @@ -1,16 +1,43 @@ #include "prism/pack.h" +// We optionally support parsing String#pack templates. For systems that don't +// want or need this functionality, it can be turned off with the +// PRISM_EXCLUDE_PACK define. +#ifdef PRISM_EXCLUDE_PACK + +void pm_pack_parse(void) {} + +#else + #include #include static uintmax_t -strtoumaxc(const char **format); +strtoumaxc(const char **format) { + uintmax_t value = 0; + while (**format >= '0' && **format <= '9') { + if (value > UINTMAX_MAX / 10) { + errno = ERANGE; + } + value = value * 10 + ((uintmax_t) (**format - '0')); + (*format)++; + } + return value; +} PRISM_EXPORTED_FUNCTION pm_pack_result -pm_pack_parse(pm_pack_variant variant, const char **format, const char *format_end, - pm_pack_type *type, pm_pack_signed *signed_type, pm_pack_endian *endian, pm_pack_size *size, - pm_pack_length_type *length_type, uint64_t *length, pm_pack_encoding *encoding) { - +pm_pack_parse( + pm_pack_variant variant, + const char **format, + const char *format_end, + pm_pack_type *type, + pm_pack_signed *signed_type, + pm_pack_endian *endian, + pm_pack_size *size, + pm_pack_length_type *length_type, + uint64_t *length, + pm_pack_encoding *encoding +) { if (*encoding == PM_PACK_ENCODING_START) { *encoding = PM_PACK_ENCODING_US_ASCII; } @@ -479,15 +506,4 @@ pm_size_to_native(pm_pack_size size) { } } -static uintmax_t -strtoumaxc(const char **format) { - uintmax_t value = 0; - while (**format >= '0' && **format <= '9') { - if (value > UINTMAX_MAX / 10) { - errno = ERANGE; - } - value = value * 10 + ((uintmax_t) (**format - '0')); - (*format)++; - } - return value; -} +#endif diff --git a/prism/pack.h b/prism/pack.h index e49484838970a8..cfdc251fe6d589 100644 --- a/prism/pack.h +++ b/prism/pack.h @@ -6,6 +6,15 @@ #ifndef PRISM_PACK_H #define PRISM_PACK_H +// We optionally support parsing String#pack templates. For systems that don't +// want or need this functionality, it can be turned off with the +// PRISM_EXCLUDE_PACK define. +#ifdef PRISM_EXCLUDE_PACK + +void pm_pack_parse(void); + +#else + #include "prism/defines.h" #include @@ -150,3 +159,5 @@ pm_pack_parse( PRISM_EXPORTED_FUNCTION size_t pm_size_to_native(pm_pack_size size); #endif + +#endif diff --git a/prism/prettyprint.h b/prism/prettyprint.h index 351b92df39510f..ea11b4a24687b3 100644 --- a/prism/prettyprint.h +++ b/prism/prettyprint.h @@ -6,6 +6,12 @@ #ifndef PRISM_PRETTYPRINT_H #define PRISM_PRETTYPRINT_H +#ifdef PRISM_EXCLUDE_PRETTYPRINT + +void pm_prettyprint(void); + +#else + #include "prism/defines.h" #include @@ -24,3 +30,5 @@ PRISM_EXPORTED_FUNCTION void pm_prettyprint(pm_buffer_t *output_buffer, const pm_parser_t *parser, const pm_node_t *node); #endif + +#endif diff --git a/prism/prism.c b/prism/prism.c index 955636027e392d..8a6ca0eccc4e6a 100644 --- a/prism/prism.c +++ b/prism/prism.c @@ -19316,6 +19316,41 @@ pm_parse_stream(pm_parser_t *parser, pm_buffer_t *buffer, void *stream, pm_parse return node; } +/** + * Parse the source and return true if it parses without errors or warnings. + */ +PRISM_EXPORTED_FUNCTION bool +pm_parse_success_p(const uint8_t *source, size_t size, const char *data) { + pm_options_t options = { 0 }; + pm_options_read(&options, data); + + pm_parser_t parser; + pm_parser_init(&parser, source, size, &options); + + pm_node_t *node = pm_parse(&parser); + pm_node_destroy(&parser, node); + + bool result = parser.error_list.size == 0 && parser.warning_list.size == 0; + pm_parser_free(&parser); + pm_options_free(&options); + + return result; +} + +#undef PM_CASE_KEYWORD +#undef PM_CASE_OPERATOR +#undef PM_CASE_WRITABLE +#undef PM_STRING_EMPTY +#undef PM_LOCATION_NODE_BASE_VALUE +#undef PM_LOCATION_NODE_VALUE +#undef PM_LOCATION_NULL_VALUE +#undef PM_LOCATION_TOKEN_VALUE + +// We optionally support serializing to a binary string. For systems that don't +// want or need this functionality, it can be turned off with the +// PRISM_EXCLUDE_SERIALIZATION define. +#ifndef PRISM_EXCLUDE_SERIALIZATION + static inline void pm_serialize_header(pm_buffer_t *buffer) { pm_buffer_append_string(buffer, "PRISM", 5); @@ -19402,14 +19437,7 @@ pm_serialize_parse_comments(pm_buffer_t *buffer, const uint8_t *source, size_t s pm_options_free(&options); } -#undef PM_CASE_KEYWORD -#undef PM_CASE_OPERATOR -#undef PM_CASE_WRITABLE -#undef PM_STRING_EMPTY -#undef PM_LOCATION_NODE_BASE_VALUE -#undef PM_LOCATION_NODE_VALUE -#undef PM_LOCATION_NULL_VALUE -#undef PM_LOCATION_TOKEN_VALUE +#endif /** An error that is going to be formatted into the output. */ typedef struct { diff --git a/prism/prism.h b/prism/prism.h index 5e3919f40b8c94..34540b9441954a 100644 --- a/prism/prism.h +++ b/prism/prism.h @@ -98,6 +98,11 @@ typedef char * (pm_parse_stream_fgets_t)(char *string, int size, void *stream); */ PRISM_EXPORTED_FUNCTION pm_node_t * pm_parse_stream(pm_parser_t *parser, pm_buffer_t *buffer, void *stream, pm_parse_stream_fgets_t *fgets, const pm_options_t *options); +// We optionally support serializing to a binary string. For systems that don't +// want or need this functionality, it can be turned off with the +// PRISM_EXCLUDE_SERIALIZATION define. +#ifndef PRISM_EXCLUDE_SERIALIZATION + /** * Parse and serialize the AST represented by the source that is read out of the * given stream into to the given buffer. @@ -185,6 +190,8 @@ PRISM_EXPORTED_FUNCTION void pm_serialize_lex(pm_buffer_t *buffer, const uint8_t */ PRISM_EXPORTED_FUNCTION void pm_serialize_parse_lex(pm_buffer_t *buffer, const uint8_t *source, size_t size, const char *data); +#endif + /** * Parse the source and return true if it parses without errors or warnings. * @@ -220,6 +227,10 @@ const char * pm_token_type_human(pm_token_type_t token_type); */ PRISM_EXPORTED_FUNCTION void pm_parser_errors_format(const pm_parser_t *parser, pm_buffer_t *buffer, bool colorize); +// We optionally support dumping to JSON. For systems that don't want or need +// this functionality, it can be turned off with the PRISM_EXCLUDE_JSON define. +#ifndef PRISM_EXCLUDE_JSON + /** * Dump JSON to the given buffer. * @@ -229,6 +240,8 @@ PRISM_EXPORTED_FUNCTION void pm_parser_errors_format(const pm_parser_t *parser, */ PRISM_EXPORTED_FUNCTION void pm_dump_json(pm_buffer_t *buffer, const pm_parser_t *parser, const pm_node_t *node); +#endif + /** * @mainpage * diff --git a/prism/templates/src/node.c.erb b/prism/templates/src/node.c.erb index 699fc00725a8d0..99a0c92fa94eef 100644 --- a/prism/templates/src/node.c.erb +++ b/prism/templates/src/node.c.erb @@ -247,6 +247,10 @@ pm_visit_child_nodes(const pm_node_t *node, bool (*visitor)(const pm_node_t *nod } } +// We optionally support dumping to JSON. For systems that don't want or need +// this functionality, it can be turned off with the PRISM_EXCLUDE_JSON define. +#ifndef PRISM_EXCLUDE_JSON + static void pm_dump_json_constant(pm_buffer_t *buffer, const pm_parser_t *parser, pm_constant_id_t constant_id) { const pm_constant_t *constant = pm_constant_pool_id_to_constant(&parser->constant_pool, constant_id); @@ -360,3 +364,5 @@ pm_dump_json(pm_buffer_t *buffer, const pm_parser_t *parser, const pm_node_t *no break; } } + +#endif diff --git a/prism/templates/src/prettyprint.c.erb b/prism/templates/src/prettyprint.c.erb index ef3385d1a66b95..27f44cd996f68a 100644 --- a/prism/templates/src/prettyprint.c.erb +++ b/prism/templates/src/prettyprint.c.erb @@ -1,6 +1,15 @@ <%# encoding: ASCII -%> #include "prism/prettyprint.h" +// We optionally support pretty printing nodes. For systems that don't want or +// need this functionality, it can be turned off with the +// PRISM_EXCLUDE_PRETTYPRINT define. +#ifdef PRISM_EXCLUDE_PRETTYPRINT + +void pm_prettyprint(void) {} + +#else + static inline void prettyprint_location(pm_buffer_t *output_buffer, const pm_parser_t *parser, const pm_location_t *location) { pm_line_column_t start = pm_newline_list_line_column(&parser->newline_list, location->start, parser->start_line); @@ -154,3 +163,5 @@ pm_prettyprint(pm_buffer_t *output_buffer, const pm_parser_t *parser, const pm_n prettyprint_node(output_buffer, parser, node, &prefix_buffer); pm_buffer_free(&prefix_buffer); } + +#endif diff --git a/prism/templates/src/serialize.c.erb b/prism/templates/src/serialize.c.erb index 27fde37f698601..94b976645dd1d1 100644 --- a/prism/templates/src/serialize.c.erb +++ b/prism/templates/src/serialize.c.erb @@ -1,5 +1,10 @@ #include "prism.h" +// We optionally support serializing to a binary string. For systems that don't +// want or need this functionality, it can be turned off with the +// PRISM_EXCLUDE_SERIALIZATION define. +#ifndef PRISM_EXCLUDE_SERIALIZATION + #include static inline uint32_t @@ -394,23 +399,4 @@ pm_serialize_parse_lex(pm_buffer_t *buffer, const uint8_t *source, size_t size, pm_options_free(&options); } -/** - * Parse the source and return true if it parses without errors or warnings. - */ -PRISM_EXPORTED_FUNCTION bool -pm_parse_success_p(const uint8_t *source, size_t size, const char *data) { - pm_options_t options = { 0 }; - pm_options_read(&options, data); - - pm_parser_t parser; - pm_parser_init(&parser, source, size, &options); - - pm_node_t *node = pm_parse(&parser); - pm_node_destroy(&parser, node); - - bool result = parser.error_list.size == 0 && parser.warning_list.size == 0; - pm_parser_free(&parser); - pm_options_free(&options); - - return result; -} +#endif diff --git a/test/prism/encoding_test.rb b/test/prism/encoding_test.rb index c23bb8c294b722..649d05b874164b 100644 --- a/test/prism/encoding_test.rb +++ b/test/prism/encoding_test.rb @@ -9,10 +9,13 @@ class EncodingTest < TestCase codepoints_1byte = 0...0x100 encodings = { Encoding::ASCII_8BIT => codepoints_1byte, - Encoding::US_ASCII => codepoints_1byte, - Encoding::Windows_1253 => codepoints_1byte + Encoding::US_ASCII => codepoints_1byte } + if !ENV["PRISM_BUILD_MINIMAL"] + encodings[Encoding::Windows_1253] = codepoints_1byte + end + # By default we don't test every codepoint in these encodings because it # takes a very long time. if ENV["PRISM_TEST_ALL_ENCODINGS"] @@ -205,21 +208,6 @@ def test_emacs_style assert_equal Encoding.find("utf-8"), actual end - # This test may be a little confusing. Basically when we use our strpbrk, it - # takes into account the encoding of the file. - def test_strpbrk_multibyte - result = Prism.parse(<<~RUBY) - # encoding: Shift_JIS - %w[\x81\x5c] - RUBY - - assert(result.errors.empty?) - assert_equal( - (+"\x81\x5c").force_encoding(Encoding::Shift_JIS), - result.value.statements.body.first.elements.first.unescaped - ) - end - def test_utf_8_variations %w[ utf-8-unix @@ -238,22 +226,39 @@ def test_first_lexed_token assert_equal Encoding.find("ascii-8bit"), encoding end - def test_slice_encoding - slice = Prism.parse("# encoding: Shift_JIS\nア").value.slice - assert_equal (+"ア").force_encoding(Encoding::SHIFT_JIS), slice - assert_equal Encoding::SHIFT_JIS, slice.encoding - end + if !ENV["PRISM_BUILD_MINIMAL"] + # This test may be a little confusing. Basically when we use our strpbrk, + # it takes into account the encoding of the file. + def test_strpbrk_multibyte + result = Prism.parse(<<~RUBY) + # encoding: Shift_JIS + %w[\x81\x5c] + RUBY + + assert(result.errors.empty?) + assert_equal( + (+"\x81\x5c").force_encoding(Encoding::Shift_JIS), + result.value.statements.body.first.elements.first.unescaped + ) + end - def test_multibyte_escapes - [ - ["'", "'"], - ["\"", "\""], - ["`", "`"], - ["/", "/"], - ["<<'HERE'\n", "\nHERE"], - ["<<-HERE\n", "\nHERE"] - ].each do |opening, closing| - assert Prism.parse_success?("# encoding: shift_jis\n'\\\x82\xA0'\n") + def test_slice_encoding + slice = Prism.parse("# encoding: Shift_JIS\nア").value.slice + assert_equal (+"ア").force_encoding(Encoding::SHIFT_JIS), slice + assert_equal Encoding::SHIFT_JIS, slice.encoding + end + + def test_multibyte_escapes + [ + ["'", "'"], + ["\"", "\""], + ["`", "`"], + ["/", "/"], + ["<<'HERE'\n", "\nHERE"], + ["<<-HERE\n", "\nHERE"] + ].each do |opening, closing| + assert Prism.parse_success?("# encoding: shift_jis\n'\\\x82\xA0'\n") + end end end diff --git a/test/prism/fuzzer_test.rb b/test/prism/fuzzer_test.rb index ac112f897ac762..511210e7ee1fec 100644 --- a/test/prism/fuzzer_test.rb +++ b/test/prism/fuzzer_test.rb @@ -1,9 +1,12 @@ # frozen_string_literal: true +return if ENV["PRISM_BUILD_MINIMAL"] + require_relative "test_helper" module Prism - # These tests are simply to exercise snippets found by the fuzzer that caused invalid memory access. + # These tests are simply to exercise snippets found by the fuzzer that caused + # invalid memory access. class FuzzerTest < TestCase def self.snippet(name, source) define_method(:"test_fuzzer_#{name}") { Prism.dump(source) } diff --git a/test/prism/magic_comment_test.rb b/test/prism/magic_comment_test.rb index 5e232c2d004cd1..4c02af732cf993 100644 --- a/test/prism/magic_comment_test.rb +++ b/test/prism/magic_comment_test.rb @@ -17,11 +17,11 @@ class MagicCommentTest < TestCase "# -*- \s\t\v encoding \s\t\v : \s\t\v ascii \s\t\v -*-", "# -*- foo: bar; encoding: ascii -*-", "# coding \t \r \v : \t \v \r ascii-8bit", - "# vim: filetype=ruby, fileencoding=big5, tabsize=3, shiftwidth=3" + "# vim: filetype=ruby, fileencoding=windows-31j, tabsize=3, shiftwidth=3" ] - examples.each do |example| - define_method(:"test_magic_comment_#{example}") do + examples.each.with_index(1) do |example, index| + define_method(:"test_magic_comment_#{index}") do assert_magic_comment(example) end end diff --git a/test/prism/parse_test.rb b/test/prism/parse_test.rb index 574a1c1714f8d1..db66b431ba7ad0 100644 --- a/test/prism/parse_test.rb +++ b/test/prism/parse_test.rb @@ -75,19 +75,21 @@ def test_parse_lex assert_equal 5, tokens.length end - def test_dump_file - assert_nothing_raised do - Prism.dump_file(__FILE__) - end + if !ENV["PRISM_BUILD_MINIMAL"] + def test_dump_file + assert_nothing_raised do + Prism.dump_file(__FILE__) + end - error = assert_raise Errno::ENOENT do - Prism.dump_file("idontexist.rb") - end + error = assert_raise Errno::ENOENT do + Prism.dump_file("idontexist.rb") + end - assert_equal "No such file or directory - idontexist.rb", error.message + assert_equal "No such file or directory - idontexist.rb", error.message - assert_raise TypeError do - Prism.dump_file(nil) + assert_raise TypeError do + Prism.dump_file(nil) + end end end @@ -259,9 +261,11 @@ def test_parse_file_comments warn("Created snapshot at #{snapshot}.") end - # Next, assert that the value can be serialized and deserialized without - # changing the shape of the tree. - assert_equal_nodes(result.value, Prism.load(source, Prism.dump(source, filepath: relative)).value) + if !ENV["PRISM_BUILD_MINIMAL"] + # Next, assert that the value can be serialized and deserialized + # without changing the shape of the tree. + assert_equal_nodes(result.value, Prism.load(source, Prism.dump(source, filepath: relative)).value) + end # Next, check that the location ranges of each node in the tree are a # superset of their respective child nodes. @@ -318,7 +322,9 @@ def test_parse_file_comments result = Prism.parse(snippet, filepath: relative) assert_empty result.errors - assert_equal_nodes(result.value, Prism.load(snippet, Prism.dump(snippet, filepath: relative)).value) + if !ENV["PRISM_BUILD_MINIMAL"] + assert_equal_nodes(result.value, Prism.load(snippet, Prism.dump(snippet, filepath: relative)).value) + end end end end diff --git a/test/prism/ruby_api_test.rb b/test/prism/ruby_api_test.rb index 80f7cb05d3e66f..4153a69ad75644 100644 --- a/test/prism/ruby_api_test.rb +++ b/test/prism/ruby_api_test.rb @@ -4,20 +4,22 @@ module Prism class RubyAPITest < TestCase - def test_ruby_api - filepath = __FILE__ - source = File.read(filepath, binmode: true, external_encoding: Encoding::UTF_8) + if !ENV["PRISM_BUILD_MINIMAL"] + def test_ruby_api + filepath = __FILE__ + source = File.read(filepath, binmode: true, external_encoding: Encoding::UTF_8) - assert_equal Prism.lex(source, filepath: filepath).value, Prism.lex_file(filepath).value - assert_equal Prism.dump(source, filepath: filepath), Prism.dump_file(filepath) + assert_equal Prism.lex(source, filepath: filepath).value, Prism.lex_file(filepath).value + assert_equal Prism.dump(source, filepath: filepath), Prism.dump_file(filepath) - serialized = Prism.dump(source, filepath: filepath) - ast1 = Prism.load(source, serialized).value - ast2 = Prism.parse(source, filepath: filepath).value - ast3 = Prism.parse_file(filepath).value + serialized = Prism.dump(source, filepath: filepath) + ast1 = Prism.load(source, serialized).value + ast2 = Prism.parse(source, filepath: filepath).value + ast3 = Prism.parse_file(filepath).value - assert_equal_nodes ast1, ast2 - assert_equal_nodes ast2, ast3 + assert_equal_nodes ast1, ast2 + assert_equal_nodes ast2, ast3 + end end def test_parse_success? diff --git a/test/prism/static_inspect_test.rb b/test/prism/static_inspect_test.rb index 41301693e3b76d..8df2fd241e0eb3 100644 --- a/test/prism/static_inspect_test.rb +++ b/test/prism/static_inspect_test.rb @@ -54,7 +54,7 @@ def test_regular_expression def test_source_encoding assert_equal "#", static_inspect("__ENCODING__") - assert_equal "#", static_inspect("__ENCODING__", encoding: "Shift_JIS") + assert_equal "#", static_inspect("__ENCODING__", encoding: "Windows-31J") end def test_source_file