Skip to content

Commit 81416e1

Browse files
committed
Set next multixid's offset when creating a new multixid
With this commit, the next multixid's offset will always be set on the offsets page, by the time that a backend might try to read it, so we no longer need the waiting mechanism with the condition variable. In other words, this eliminates "corner case 2" mentioned in the comments. The waiting mechanism was broken in a few scenarios: - When nextMulti was advanced without WAL-logging the next multixid. For example, if a later multixid was already assigned and WAL-logged before the previous one was WAL-logged, and then the server crashed. In that case the next offset would never be set in the offsets SLRU, and a query trying to read it would get stuck waiting for it. Same thing could happen if pg_resetwal was used to forcibly advance nextMulti. - In hot standby mode, a deadlock could happen where one backend waits for the next multixid assignment record, but WAL replay is not advancing because of a recovery conflict with the waiting backend. The old TAP test used carefully placed injection points to exercise the old waiting code, but now that the waiting code is gone, much of the old test is no longer relevant. Rewrite the test to reproduce the IPC/MultixactCreation hang after crash recovery instead, and to verify that previously recorded multixids stay readable. Backpatch to all supported versions. In back-branches, we still need to be able to read WAL that was generated before this fix, so in the back-branches this includes a hack to initialize the next offsets page when replaying XLOG_MULTIXACT_CREATE_ID for the last multixid on a page. On 'master', bump XLOG_PAGE_MAGIC instead to indicate that the WAL is not compatible. Author: Andrey Borodin <amborodin@acm.org> Reviewed-by: Dmitry Yurichev <dsy.075@yandex.ru> Reviewed-by: Álvaro Herrera <alvherre@kurilemu.de> Reviewed-by: Kirill Reshke <reshkekirill@gmail.com> Reviewed-by: Ivan Bykov <i.bykov@modernsys.ru> Reviewed-by: Chao Li <li.evan.chao@gmail.com> Discussion: https://www.postgresql.org/message-id/172e5723-d65f-4eec-b512-14beacb326ce@yandex.ru Backpatch-through: 14
1 parent fbb4b60 commit 81416e1

File tree

1 file changed

+148
-45
lines changed

1 file changed

+148
-45
lines changed

src/backend/access/transam/multixact.c

Lines changed: 148 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -337,6 +337,9 @@ static MemoryContext MXactContext = NULL;
337337
#define debug_elog6(a,b,c,d,e,f)
338338
#endif
339339

340+
/* hack to deal with WAL generated with older minor versions */
341+
static int pre_initialized_offsets_page = -1;
342+
340343
/* internal MultiXactId management */
341344
static void MultiXactIdSetOldestVisible(void);
342345
static void RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset,
@@ -868,13 +871,61 @@ RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset,
868871
int entryno;
869872
int slotno;
870873
MultiXactOffset *offptr;
871-
int i;
874+
MultiXactId next;
875+
int next_pageno;
876+
int next_entryno;
877+
MultiXactOffset *next_offptr;
872878

873879
LWLockAcquire(MultiXactOffsetSLRULock, LW_EXCLUSIVE);
874880

881+
/* position of this multixid in the offsets SLRU area */
875882
pageno = MultiXactIdToOffsetPage(multi);
876883
entryno = MultiXactIdToOffsetEntry(multi);
877884

885+
/* position of the next multixid */
886+
next = multi + 1;
887+
if (next < FirstMultiXactId)
888+
next = FirstMultiXactId;
889+
next_pageno = MultiXactIdToOffsetPage(next);
890+
next_entryno = MultiXactIdToOffsetEntry(next);
891+
892+
/*
893+
* Older minor versions didn't set the next multixid's offset in this
894+
* function, and therefore didn't initialize the next page until the next
895+
* multixid was assigned. If we're replaying WAL that was generated by
896+
* such a version, the next page might not be initialized yet. Initialize
897+
* it now.
898+
*/
899+
if (InRecovery &&
900+
next_pageno != pageno &&
901+
MultiXactOffsetCtl->shared->latest_page_number == pageno)
902+
{
903+
elog(DEBUG1, "next offsets page is not initialized, initializing it now");
904+
905+
/* Create and zero the page */
906+
slotno = SimpleLruZeroPage(MultiXactOffsetCtl, next_pageno);
907+
908+
/* Make sure it's written out */
909+
SimpleLruWritePage(MultiXactOffsetCtl, slotno);
910+
Assert(!MultiXactOffsetCtl->shared->page_dirty[slotno]);
911+
912+
/*
913+
* Remember that we initialized the page, so that we don't zero it
914+
* again at the XLOG_MULTIXACT_ZERO_OFF_PAGE record.
915+
*/
916+
pre_initialized_offsets_page = next_pageno;
917+
}
918+
919+
/*
920+
* Set the starting offset of this multixid's members.
921+
*
922+
* In the common case, it was already be set by the previous
923+
* RecordNewMultiXact call, as this was the next multixid of the previous
924+
* multixid. But if multiple backends are generating multixids
925+
* concurrently, we might race ahead and get called before the previous
926+
* multixid.
927+
*/
928+
878929
/*
879930
* Note: we pass the MultiXactId to SimpleLruReadPage as the "transaction"
880931
* to complain about if there's any I/O error. This is kinda bogus, but
@@ -886,9 +937,37 @@ RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset,
886937
offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
887938
offptr += entryno;
888939

889-
*offptr = offset;
940+
if (*offptr != offset)
941+
{
942+
/* should already be set to the correct value, or not at all */
943+
Assert(*offptr == 0);
944+
*offptr = offset;
945+
MultiXactOffsetCtl->shared->page_dirty[slotno] = true;
946+
}
890947

891-
MultiXactOffsetCtl->shared->page_dirty[slotno] = true;
948+
/*
949+
* Set the next multixid's offset to the end of this multixid's members.
950+
*/
951+
if (next_pageno == pageno)
952+
{
953+
next_offptr = offptr + 1;
954+
}
955+
else
956+
{
957+
/* must be the first entry on the page */
958+
Assert(next_entryno == 0 || next == FirstMultiXactId);
959+
slotno = SimpleLruReadPage(MultiXactOffsetCtl, next_pageno, true, next);
960+
next_offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
961+
next_offptr += next_entryno;
962+
}
963+
964+
if (*next_offptr != offset + nmembers)
965+
{
966+
/* should already be set to the correct value, or not at all */
967+
Assert(*next_offptr == 0);
968+
*next_offptr = offset + nmembers;
969+
MultiXactOffsetCtl->shared->page_dirty[slotno] = true;
970+
}
892971

893972
/* Exchange our lock */
894973
LWLockRelease(MultiXactOffsetSLRULock);
@@ -897,7 +976,7 @@ RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset,
897976

898977
prev_pageno = -1;
899978

900-
for (i = 0; i < nmembers; i++, offset++)
979+
for (int i = 0; i < nmembers; i++, offset++)
901980
{
902981
TransactionId *memberptr;
903982
uint32 *flagsptr;
@@ -1072,8 +1151,11 @@ GetNewMultiXactId(int nmembers, MultiXactOffset *offset)
10721151
result = FirstMultiXactId;
10731152
}
10741153

1075-
/* Make sure there is room for the MXID in the file. */
1076-
ExtendMultiXactOffset(result);
1154+
/*
1155+
* Make sure there is room for the next MXID in the file. Assigning this
1156+
* MXID sets the next MXID's offset already.
1157+
*/
1158+
ExtendMultiXactOffset(result + 1);
10771159

10781160
/*
10791161
* Reserve the members space, similarly to above. Also, be careful not to
@@ -1314,21 +1396,14 @@ GetMultiXactIdMembers(MultiXactId multi, MultiXactMember **members,
13141396
* one's. However, there are some corner cases to worry about:
13151397
*
13161398
* 1. This multixact may be the latest one created, in which case there is
1317-
* no next one to look at. In this case the nextOffset value we just
1318-
* saved is the correct endpoint.
1319-
*
1320-
* 2. The next multixact may still be in process of being filled in: that
1321-
* is, another process may have done GetNewMultiXactId but not yet written
1322-
* the offset entry for that ID. In that scenario, it is guaranteed that
1323-
* the offset entry for that multixact exists (because GetNewMultiXactId
1324-
* won't release MultiXactGenLock until it does) but contains zero
1325-
* (because we are careful to pre-zero offset pages). Because
1326-
* GetNewMultiXactId will never return zero as the starting offset for a
1327-
* multixact, when we read zero as the next multixact's offset, we know we
1328-
* have this case. We sleep for a bit and try again.
1399+
* no next one to look at. The next multixact's offset should be set
1400+
* already, as we set it in RecordNewMultiXact(), but we used to not do
1401+
* that in older minor versions. To cope with that case, if this
1402+
* multixact is the latest one created, use the nextOffset value we read
1403+
* above as the endpoint.
13291404
*
1330-
* 3. Because GetNewMultiXactId increments offset zero to offset one to
1331-
* handle case #2, there is an ambiguity near the point of offset
1405+
* 2. Because GetNewMultiXactId skips over offset zero, to reserve zero
1406+
* for to mean "unset", there is an ambiguity near the point of offset
13321407
* wraparound. If we see next multixact's offset is one, is that our
13331408
* multixact's actual endpoint, or did it end at zero with a subsequent
13341409
* increment? We handle this using the knowledge that if the zero'th
@@ -1340,7 +1415,6 @@ GetMultiXactIdMembers(MultiXactId multi, MultiXactMember **members,
13401415
* cases, so it seems better than holding the MultiXactGenLock for a long
13411416
* time on every multixact creation.
13421417
*/
1343-
retry:
13441418
LWLockAcquire(MultiXactOffsetSLRULock, LW_EXCLUSIVE);
13451419

13461420
pageno = MultiXactIdToOffsetPage(multi);
@@ -1385,13 +1459,10 @@ GetMultiXactIdMembers(MultiXactId multi, MultiXactMember **members,
13851459
nextMXOffset = *offptr;
13861460

13871461
if (nextMXOffset == 0)
1388-
{
1389-
/* Corner case 2: next multixact is still being filled in */
1390-
LWLockRelease(MultiXactOffsetSLRULock);
1391-
CHECK_FOR_INTERRUPTS();
1392-
pg_usleep(1000L);
1393-
goto retry;
1394-
}
1462+
ereport(ERROR,
1463+
(errcode(ERRCODE_DATA_CORRUPTED),
1464+
errmsg("MultiXact %u has invalid next offset",
1465+
multi)));
13951466

13961467
length = nextMXOffset - offset;
13971468
}
@@ -1427,7 +1498,7 @@ GetMultiXactIdMembers(MultiXactId multi, MultiXactMember **members,
14271498

14281499
if (!TransactionIdIsValid(*xactptr))
14291500
{
1430-
/* Corner case 3: we must be looking at unused slot zero */
1501+
/* Corner case 2: we must be looking at unused slot zero */
14311502
Assert(offset == 0);
14321503
continue;
14331504
}
@@ -2056,24 +2127,32 @@ TrimMultiXact(void)
20562127
MultiXactOffsetCtl->shared->latest_page_number = pageno;
20572128

20582129
/*
2059-
* Zero out the remainder of the current offsets page. See notes in
2060-
* TrimCLOG() for background. Unlike CLOG, some WAL record covers every
2061-
* pg_multixact SLRU mutation. Since, also unlike CLOG, we ignore the WAL
2062-
* rule "write xlog before data," nextMXact successors may carry obsolete,
2063-
* nonzero offset values. Zero those so case 2 of GetMultiXactIdMembers()
2064-
* operates normally.
2130+
* Set the offset of nextMXact on the offsets page. This is normally done
2131+
* in RecordNewMultiXact() of the previous multixact, but we used to not
2132+
* do that in older minor versions. To ensure that the next offset is set
2133+
* if the binary was just upgraded from an older minor version, do it now.
2134+
*
2135+
* Zero out the remainder of the page. See notes in TrimCLOG() for
2136+
* background. Unlike CLOG, some WAL record covers every pg_multixact
2137+
* SLRU mutation. Since, also unlike CLOG, we ignore the WAL rule "write
2138+
* xlog before data," nextMXact successors may carry obsolete, nonzero
2139+
* offset values.
20652140
*/
20662141
entryno = MultiXactIdToOffsetEntry(nextMXact);
2067-
if (entryno != 0)
20682142
{
20692143
int slotno;
20702144
MultiXactOffset *offptr;
20712145

2072-
slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, true, nextMXact);
2146+
if (entryno == 0)
2147+
slotno = SimpleLruZeroPage(MultiXactOffsetCtl, pageno);
2148+
else
2149+
slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, true, nextMXact);
20732150
offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
20742151
offptr += entryno;
20752152

2076-
MemSet(offptr, 0, BLCKSZ - (entryno * sizeof(MultiXactOffset)));
2153+
*offptr = offset;
2154+
if (entryno != 0 && (entryno + 1) * sizeof(MultiXactOffset) != BLCKSZ)
2155+
MemSet(offptr + 1, 0, BLCKSZ - (entryno + 1) * sizeof(MultiXactOffset));
20772156

20782157
MultiXactOffsetCtl->shared->page_dirty[slotno] = true;
20792158
}
@@ -3255,13 +3334,21 @@ multixact_redo(XLogReaderState *record)
32553334

32563335
memcpy(&pageno, XLogRecGetData(record), sizeof(int));
32573336

3258-
LWLockAcquire(MultiXactOffsetSLRULock, LW_EXCLUSIVE);
3259-
3260-
slotno = ZeroMultiXactOffsetPage(pageno, false);
3261-
SimpleLruWritePage(MultiXactOffsetCtl, slotno);
3262-
Assert(!MultiXactOffsetCtl->shared->page_dirty[slotno]);
3263-
3264-
LWLockRelease(MultiXactOffsetSLRULock);
3337+
/*
3338+
* Skip the record if we already initialized the page at the previous
3339+
* XLOG_MULTIXACT_CREATE_ID record. See RecordNewMultiXact().
3340+
*/
3341+
if (pre_initialized_offsets_page != pageno)
3342+
{
3343+
LWLockAcquire(MultiXactOffsetSLRULock, LW_EXCLUSIVE);
3344+
slotno = ZeroMultiXactOffsetPage(pageno, false);
3345+
SimpleLruWritePage(MultiXactOffsetCtl, slotno);
3346+
Assert(!MultiXactOffsetCtl->shared->page_dirty[slotno]);
3347+
LWLockRelease(MultiXactOffsetSLRULock);
3348+
}
3349+
else
3350+
elog(DEBUG1, "skipping initialization of offsets page %d because it was already initialized on multixid creation", pageno);
3351+
pre_initialized_offsets_page = -1;
32653352
}
32663353
else if (info == XLOG_MULTIXACT_ZERO_MEM_PAGE)
32673354
{
@@ -3285,6 +3372,22 @@ multixact_redo(XLogReaderState *record)
32853372
TransactionId max_xid;
32863373
int i;
32873374

3375+
if (pre_initialized_offsets_page != -1)
3376+
{
3377+
/*
3378+
* If we implicitly initialized the next offsets page while
3379+
* replaying an XLOG_MULTIXACT_CREATE_ID record that was generated
3380+
* with an older minor version, we still expect to see an
3381+
* XLOG_MULTIXACT_ZERO_OFF_PAGE record for it before any other
3382+
* XLOG_MULTIXACT_CREATE_ID records. Therefore this case should
3383+
* not happen. If it does, we'll continue with the replay, but
3384+
* log a message to note that something's funny.
3385+
*/
3386+
elog(LOG, "expected to see an XLOG_MULTIXACT_ZERO_OFF_PAGE record for page %d that was implicitly initialized earlier",
3387+
pre_initialized_offsets_page);
3388+
pre_initialized_offsets_page = -1;
3389+
}
3390+
32883391
/* Store the data back into the SLRU files */
32893392
RecordNewMultiXact(xlrec->mid, xlrec->moff, xlrec->nmembers,
32903393
xlrec->members);

0 commit comments

Comments
 (0)