Skip to content

Commit 968b999

Browse files
committed
[PRISM] Fix ASAN reading off end of strpbrk cache
1 parent 5026acf commit 968b999

File tree

3 files changed

+22
-11
lines changed

3 files changed

+22
-11
lines changed

prism/parser.h

Lines changed: 12 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -107,6 +107,13 @@ typedef struct {
107107
* that the lexer is now in the PM_LEX_STRING mode, and will return tokens that
108108
* are found as part of a string.
109109
*/
110+
/**
111+
* The size of the breakpoints and strpbrk cache charset buffers. All
112+
* breakpoint arrays and the strpbrk cache charset must share this size so
113+
* that memcmp can safely compare the full buffer without overreading.
114+
*/
115+
#define PM_STRPBRK_CACHE_SIZE 16
116+
110117
typedef struct pm_lex_mode {
111118
/** The type of this lex mode. */
112119
enum {
@@ -169,7 +176,7 @@ typedef struct pm_lex_mode {
169176
* This is the character set that should be used to delimit the
170177
* tokens within the list.
171178
*/
172-
uint8_t breakpoints[11];
179+
uint8_t breakpoints[PM_STRPBRK_CACHE_SIZE];
173180
} list;
174181

175182
struct {
@@ -191,7 +198,7 @@ typedef struct pm_lex_mode {
191198
* This is the character set that should be used to delimit the
192199
* tokens within the regular expression.
193200
*/
194-
uint8_t breakpoints[7];
201+
uint8_t breakpoints[PM_STRPBRK_CACHE_SIZE];
195202
} regexp;
196203

197204
struct {
@@ -224,7 +231,7 @@ typedef struct pm_lex_mode {
224231
* This is the character set that should be used to delimit the
225232
* tokens within the string.
226233
*/
227-
uint8_t breakpoints[7];
234+
uint8_t breakpoints[PM_STRPBRK_CACHE_SIZE];
228235
} string;
229236

230237
struct {
@@ -970,8 +977,8 @@ struct pm_parser {
970977
* (which is the common case during string/regex/list lexing).
971978
*/
972979
struct {
973-
/** The cached charset (null-terminated, max 11 chars + NUL). */
974-
uint8_t charset[12];
980+
/** The cached charset (NUL-terminated and NUL-padded to the full buffer size). */
981+
uint8_t charset[PM_STRPBRK_CACHE_SIZE];
975982

976983
/** Nibble-based low lookup table for SIMD matching. */
977984
uint8_t low_lut[16];

prism/prism.c

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -149,7 +149,8 @@ lex_mode_push_list(pm_parser_t *parser, bool interpolation, uint8_t delimiter) {
149149
// These are the places where we need to split up the content of the list.
150150
// We'll use strpbrk to find the first of these characters.
151151
uint8_t *breakpoints = lex_mode.as.list.breakpoints;
152-
memcpy(breakpoints, "\\ \t\f\r\v\n\0\0\0", sizeof(lex_mode.as.list.breakpoints));
152+
memset(breakpoints, 0, PM_STRPBRK_CACHE_SIZE);
153+
memcpy(breakpoints, "\\ \t\f\r\v\n", sizeof("\\ \t\f\r\v\n") - 1);
153154
size_t index = 7;
154155

155156
// Now we'll add the terminator to the list of breakpoints. If the
@@ -201,7 +202,8 @@ lex_mode_push_regexp(pm_parser_t *parser, uint8_t incrementor, uint8_t terminato
201202
// regular expression. We'll use strpbrk to find the first of these
202203
// characters.
203204
uint8_t *breakpoints = lex_mode.as.regexp.breakpoints;
204-
memcpy(breakpoints, "\r\n\\#\0\0", sizeof(lex_mode.as.regexp.breakpoints));
205+
memset(breakpoints, 0, PM_STRPBRK_CACHE_SIZE);
206+
memcpy(breakpoints, "\r\n\\#", sizeof("\r\n\\#") - 1);
205207
size_t index = 4;
206208

207209
// First we'll add the terminator.
@@ -237,7 +239,8 @@ lex_mode_push_string(pm_parser_t *parser, bool interpolation, bool label_allowed
237239
// These are the places where we need to split up the content of the
238240
// string. We'll use strpbrk to find the first of these characters.
239241
uint8_t *breakpoints = lex_mode.as.string.breakpoints;
240-
memcpy(breakpoints, "\r\n\\\0\0\0", sizeof(lex_mode.as.string.breakpoints));
242+
memset(breakpoints, 0, PM_STRPBRK_CACHE_SIZE);
243+
memcpy(breakpoints, "\r\n\\", sizeof("\r\n\\") - 1);
241244
size_t index = 3;
242245

243246
// Now add in the terminator. If the terminator is not already a NULL byte,
@@ -12054,7 +12057,7 @@ parser_lex(pm_parser_t *parser) {
1205412057
// Otherwise we'll be parsing string content. These are the places
1205512058
// where we need to split up the content of the heredoc. We'll use
1205612059
// strpbrk to find the first of these characters.
12057-
uint8_t breakpoints[] = "\r\n\\#";
12060+
uint8_t breakpoints[PM_STRPBRK_CACHE_SIZE] = "\r\n\\#";
1205812061

1205912062
pm_heredoc_quote_t quote = heredoc_lex_mode->quote;
1206012063
if (quote == PM_HEREDOC_QUOTE_SINGLE) {

prism/util/pm_strpbrk.c

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -59,8 +59,9 @@ pm_strpbrk_explicit_encoding_set(pm_parser_t *parser, uint32_t start, uint32_t l
5959
*/
6060
static inline void
6161
pm_strpbrk_cache_update(pm_parser_t *parser, const uint8_t *charset) {
62-
// The cache key is the full 12-byte charset buffer. Since it is always
63-
// NUL-padded, a fixed-size comparison covers both content and length.
62+
// The cache key is the full charset buffer (PM_STRPBRK_CACHE_SIZE bytes).
63+
// Since it is always NUL-padded, a fixed-size comparison covers both
64+
// content and length.
6465
if (memcmp(parser->strpbrk_cache.charset, charset, sizeof(parser->strpbrk_cache.charset)) == 0) return;
6566

6667
memset(parser->strpbrk_cache.low_lut, 0, sizeof(parser->strpbrk_cache.low_lut));

0 commit comments

Comments
 (0)