Fail hard on out-of-memory failures in xlogreader.c

This commit changes the WAL reader routines so as a FATAL for the backend or exit(FAILURE) for the frontend is triggered if an allocation for a WAL record decode fails in walreader.c, rather than treating this case as bogus data, which would be equivalent to the end of WAL. The key is to avoid palloc_extended(MCXT_ALLOC_NO_OOM) in walreader.c, relying on plain palloc() calls. The previous behavior could make WAL replay finish too early than it should. For example, crash recovery finishing earlier may corrupt clusters because not all the WAL available locally was replayed to ensure a consistent state. Out-of-memory failures would show up randomly depending on the memory pressure on the host, but one simple case would be to generate a large record, then replay this record after downsizing a host, as Ethan Mertz originally reported. This relies on bae868c, as the WAL reader routines now do the memory allocation required for a record only once its header has been fully read and validated, making xl_tot_len trustable. Making the WAL reader react differently on out-of-memory or bogus record data would require ABI changes, so this is the safest choice for stable branches. Also, it is worth noting that 3f1ce97 has been using a plain palloc() in this code for some time now. Thanks to Noah Misch and Thomas Munro for the discussion. Like the other commit, backpatch down to 12, leaving out v11 that will be EOL'd soon. The behavior of considering a failed allocation as bogus data comes originally from 0ffe11a, where the record length retrieved from its header was not entirely trustable. Reported-by: Ethan Mertz Discussion: https://postgr.es/m/ZRKKdI5-RRlta3aF@paquier.xyz Backpatch-through: 12
postgres · Oct 3, 2023 · baeb854 · baeb854
1 parent 829d91c
commit baeb854
Showing 1 changed file with 5 additions and 26 deletions.
diff --git a/src/backend/access/transam/xlogreader.c b/src/backend/access/transam/xlogreader.c
@@ -35,7 +35,7 @@
 
 static void report_invalid_record(XLogReaderState *state, const char *fmt,...)
 			pg_attribute_printf(2, 3);
-static bool allocate_recordbuf(XLogReaderState *state, uint32 reclength);
+static void allocate_recordbuf(XLogReaderState *state, uint32 reclength);
 static int	ReadPageInternal(XLogReaderState *state, XLogRecPtr pageptr,
 							 int reqLen);
 static void XLogReaderInvalReadState(XLogReaderState *state);
@@ -124,14 +124,7 @@ XLogReaderAllocate(int wal_segment_size, const char *waldir,
 	 * Allocate an initial readRecordBuf of minimal size, which can later be
 	 * enlarged if necessary.
 	 */
-	if (!allocate_recordbuf(state, 0))
-	{
-		pfree(state->errormsg_buf);
-		pfree(state->readBuf);
-		pfree(state);
-		return NULL;
-	}
-
+	allocate_recordbuf(state, 0);
 	return state;
 }
 
@@ -160,7 +153,6 @@ XLogReaderFree(XLogReaderState *state)
 
 /*
  * Allocate readRecordBuf to fit a record of at least the given length.
- * Returns true if successful, false if out of memory.
  *
  * readRecordBufSize is set to the new buffer size.
  *
@@ -172,7 +164,7 @@ XLogReaderFree(XLogReaderState *state)
  * Note: This routine should *never* be called for xl_tot_len until the header
  * of the record has been fully validated.
  */
-static bool
+static void
 allocate_recordbuf(XLogReaderState *state, uint32 reclength)
 {
 	uint32		newSize = reclength;
@@ -182,15 +174,8 @@ allocate_recordbuf(XLogReaderState *state, uint32 reclength)
 
 	if (state->readRecordBuf)
 		pfree(state->readRecordBuf);
-	state->readRecordBuf =
-		(char *) palloc_extended(newSize, MCXT_ALLOC_NO_OOM);
-	if (state->readRecordBuf == NULL)
-	{
-		state->readRecordBufSize = 0;
-		return false;
-	}
+	state->readRecordBuf = (char *) palloc(newSize);
 	state->readRecordBufSize = newSize;
-	return true;
 }
 
 /*
@@ -524,13 +509,7 @@ XLogReadRecord(XLogReaderState *state, char **errormsg)
 				Assert(gotlen <= lengthof(save_copy));
 				Assert(gotlen <= state->readRecordBufSize);
 				memcpy(save_copy, state->readRecordBuf, gotlen);
-				if (!allocate_recordbuf(state, total_len))
-				{
-					/* We treat this as a "bogus data" condition */
-					report_invalid_record(state, "record length %u at %X/%X too long",
-										  total_len, LSN_FORMAT_ARGS(RecPtr));
-					goto err;
-				}
+				allocate_recordbuf(state, total_len);
 				memcpy(state->readRecordBuf, save_copy, gotlen);
 				buffer = state->readRecordBuf + gotlen;
 			}