@@ -337,6 +337,9 @@ static MemoryContext MXactContext = NULL;
337337#define debug_elog6 (a ,b ,c ,d ,e ,f )
338338#endif
339339
340+ /* hack to deal with WAL generated with older minor versions */
341+ static int pre_initialized_offsets_page = -1 ;
342+
340343/* internal MultiXactId management */
341344static void MultiXactIdSetOldestVisible (void );
342345static void RecordNewMultiXact (MultiXactId multi , MultiXactOffset offset ,
@@ -868,13 +871,61 @@ RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset,
868871 int entryno ;
869872 int slotno ;
870873 MultiXactOffset * offptr ;
871- int i ;
874+ MultiXactId next ;
875+ int next_pageno ;
876+ int next_entryno ;
877+ MultiXactOffset * next_offptr ;
872878
873879 LWLockAcquire (MultiXactOffsetSLRULock , LW_EXCLUSIVE );
874880
881+ /* position of this multixid in the offsets SLRU area */
875882 pageno = MultiXactIdToOffsetPage (multi );
876883 entryno = MultiXactIdToOffsetEntry (multi );
877884
885+ /* position of the next multixid */
886+ next = multi + 1 ;
887+ if (next < FirstMultiXactId )
888+ next = FirstMultiXactId ;
889+ next_pageno = MultiXactIdToOffsetPage (next );
890+ next_entryno = MultiXactIdToOffsetEntry (next );
891+
892+ /*
893+ * Older minor versions didn't set the next multixid's offset in this
894+ * function, and therefore didn't initialize the next page until the next
895+ * multixid was assigned. If we're replaying WAL that was generated by
896+ * such a version, the next page might not be initialized yet. Initialize
897+ * it now.
898+ */
899+ if (InRecovery &&
900+ next_pageno != pageno &&
901+ MultiXactOffsetCtl -> shared -> latest_page_number == pageno )
902+ {
903+ elog (DEBUG1 , "next offsets page is not initialized, initializing it now" );
904+
905+ /* Create and zero the page */
906+ slotno = SimpleLruZeroPage (MultiXactOffsetCtl , next_pageno );
907+
908+ /* Make sure it's written out */
909+ SimpleLruWritePage (MultiXactOffsetCtl , slotno );
910+ Assert (!MultiXactOffsetCtl -> shared -> page_dirty [slotno ]);
911+
912+ /*
913+ * Remember that we initialized the page, so that we don't zero it
914+ * again at the XLOG_MULTIXACT_ZERO_OFF_PAGE record.
915+ */
916+ pre_initialized_offsets_page = next_pageno ;
917+ }
918+
919+ /*
920+ * Set the starting offset of this multixid's members.
921+ *
922+ * In the common case, it was already be set by the previous
923+ * RecordNewMultiXact call, as this was the next multixid of the previous
924+ * multixid. But if multiple backends are generating multixids
925+ * concurrently, we might race ahead and get called before the previous
926+ * multixid.
927+ */
928+
878929 /*
879930 * Note: we pass the MultiXactId to SimpleLruReadPage as the "transaction"
880931 * to complain about if there's any I/O error. This is kinda bogus, but
@@ -886,9 +937,37 @@ RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset,
886937 offptr = (MultiXactOffset * ) MultiXactOffsetCtl -> shared -> page_buffer [slotno ];
887938 offptr += entryno ;
888939
889- * offptr = offset ;
940+ if (* offptr != offset )
941+ {
942+ /* should already be set to the correct value, or not at all */
943+ Assert (* offptr == 0 );
944+ * offptr = offset ;
945+ MultiXactOffsetCtl -> shared -> page_dirty [slotno ] = true;
946+ }
890947
891- MultiXactOffsetCtl -> shared -> page_dirty [slotno ] = true;
948+ /*
949+ * Set the next multixid's offset to the end of this multixid's members.
950+ */
951+ if (next_pageno == pageno )
952+ {
953+ next_offptr = offptr + 1 ;
954+ }
955+ else
956+ {
957+ /* must be the first entry on the page */
958+ Assert (next_entryno == 0 || next == FirstMultiXactId );
959+ slotno = SimpleLruReadPage (MultiXactOffsetCtl , next_pageno , true, next );
960+ next_offptr = (MultiXactOffset * ) MultiXactOffsetCtl -> shared -> page_buffer [slotno ];
961+ next_offptr += next_entryno ;
962+ }
963+
964+ if (* next_offptr != offset + nmembers )
965+ {
966+ /* should already be set to the correct value, or not at all */
967+ Assert (* next_offptr == 0 );
968+ * next_offptr = offset + nmembers ;
969+ MultiXactOffsetCtl -> shared -> page_dirty [slotno ] = true;
970+ }
892971
893972 /* Exchange our lock */
894973 LWLockRelease (MultiXactOffsetSLRULock );
@@ -897,7 +976,7 @@ RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset,
897976
898977 prev_pageno = -1 ;
899978
900- for (i = 0 ; i < nmembers ; i ++ , offset ++ )
979+ for (int i = 0 ; i < nmembers ; i ++ , offset ++ )
901980 {
902981 TransactionId * memberptr ;
903982 uint32 * flagsptr ;
@@ -1072,8 +1151,11 @@ GetNewMultiXactId(int nmembers, MultiXactOffset *offset)
10721151 result = FirstMultiXactId ;
10731152 }
10741153
1075- /* Make sure there is room for the MXID in the file. */
1076- ExtendMultiXactOffset (result );
1154+ /*
1155+ * Make sure there is room for the next MXID in the file. Assigning this
1156+ * MXID sets the next MXID's offset already.
1157+ */
1158+ ExtendMultiXactOffset (result + 1 );
10771159
10781160 /*
10791161 * Reserve the members space, similarly to above. Also, be careful not to
@@ -1314,21 +1396,14 @@ GetMultiXactIdMembers(MultiXactId multi, MultiXactMember **members,
13141396 * one's. However, there are some corner cases to worry about:
13151397 *
13161398 * 1. This multixact may be the latest one created, in which case there is
1317- * no next one to look at. In this case the nextOffset value we just
1318- * saved is the correct endpoint.
1319- *
1320- * 2. The next multixact may still be in process of being filled in: that
1321- * is, another process may have done GetNewMultiXactId but not yet written
1322- * the offset entry for that ID. In that scenario, it is guaranteed that
1323- * the offset entry for that multixact exists (because GetNewMultiXactId
1324- * won't release MultiXactGenLock until it does) but contains zero
1325- * (because we are careful to pre-zero offset pages). Because
1326- * GetNewMultiXactId will never return zero as the starting offset for a
1327- * multixact, when we read zero as the next multixact's offset, we know we
1328- * have this case. We sleep for a bit and try again.
1399+ * no next one to look at. The next multixact's offset should be set
1400+ * already, as we set it in RecordNewMultiXact(), but we used to not do
1401+ * that in older minor versions. To cope with that case, if this
1402+ * multixact is the latest one created, use the nextOffset value we read
1403+ * above as the endpoint.
13291404 *
1330- * 3 . Because GetNewMultiXactId increments offset zero to offset one to
1331- * handle case #2 , there is an ambiguity near the point of offset
1405+ * 2 . Because GetNewMultiXactId skips over offset zero, to reserve zero
1406+ * for to mean "unset" , there is an ambiguity near the point of offset
13321407 * wraparound. If we see next multixact's offset is one, is that our
13331408 * multixact's actual endpoint, or did it end at zero with a subsequent
13341409 * increment? We handle this using the knowledge that if the zero'th
@@ -1340,7 +1415,6 @@ GetMultiXactIdMembers(MultiXactId multi, MultiXactMember **members,
13401415 * cases, so it seems better than holding the MultiXactGenLock for a long
13411416 * time on every multixact creation.
13421417 */
1343- retry :
13441418 LWLockAcquire (MultiXactOffsetSLRULock , LW_EXCLUSIVE );
13451419
13461420 pageno = MultiXactIdToOffsetPage (multi );
@@ -1385,13 +1459,10 @@ GetMultiXactIdMembers(MultiXactId multi, MultiXactMember **members,
13851459 nextMXOffset = * offptr ;
13861460
13871461 if (nextMXOffset == 0 )
1388- {
1389- /* Corner case 2: next multixact is still being filled in */
1390- LWLockRelease (MultiXactOffsetSLRULock );
1391- CHECK_FOR_INTERRUPTS ();
1392- pg_usleep (1000L );
1393- goto retry ;
1394- }
1462+ ereport (ERROR ,
1463+ (errcode (ERRCODE_DATA_CORRUPTED ),
1464+ errmsg ("MultiXact %u has invalid next offset" ,
1465+ multi )));
13951466
13961467 length = nextMXOffset - offset ;
13971468 }
@@ -1427,7 +1498,7 @@ GetMultiXactIdMembers(MultiXactId multi, MultiXactMember **members,
14271498
14281499 if (!TransactionIdIsValid (* xactptr ))
14291500 {
1430- /* Corner case 3 : we must be looking at unused slot zero */
1501+ /* Corner case 2 : we must be looking at unused slot zero */
14311502 Assert (offset == 0 );
14321503 continue ;
14331504 }
@@ -2056,24 +2127,32 @@ TrimMultiXact(void)
20562127 MultiXactOffsetCtl -> shared -> latest_page_number = pageno ;
20572128
20582129 /*
2059- * Zero out the remainder of the current offsets page. See notes in
2060- * TrimCLOG() for background. Unlike CLOG, some WAL record covers every
2061- * pg_multixact SLRU mutation. Since, also unlike CLOG, we ignore the WAL
2062- * rule "write xlog before data," nextMXact successors may carry obsolete,
2063- * nonzero offset values. Zero those so case 2 of GetMultiXactIdMembers()
2064- * operates normally.
2130+ * Set the offset of nextMXact on the offsets page. This is normally done
2131+ * in RecordNewMultiXact() of the previous multixact, but we used to not
2132+ * do that in older minor versions. To ensure that the next offset is set
2133+ * if the binary was just upgraded from an older minor version, do it now.
2134+ *
2135+ * Zero out the remainder of the page. See notes in TrimCLOG() for
2136+ * background. Unlike CLOG, some WAL record covers every pg_multixact
2137+ * SLRU mutation. Since, also unlike CLOG, we ignore the WAL rule "write
2138+ * xlog before data," nextMXact successors may carry obsolete, nonzero
2139+ * offset values.
20652140 */
20662141 entryno = MultiXactIdToOffsetEntry (nextMXact );
2067- if (entryno != 0 )
20682142 {
20692143 int slotno ;
20702144 MultiXactOffset * offptr ;
20712145
2072- slotno = SimpleLruReadPage (MultiXactOffsetCtl , pageno , true, nextMXact );
2146+ if (entryno == 0 )
2147+ slotno = SimpleLruZeroPage (MultiXactOffsetCtl , pageno );
2148+ else
2149+ slotno = SimpleLruReadPage (MultiXactOffsetCtl , pageno , true, nextMXact );
20732150 offptr = (MultiXactOffset * ) MultiXactOffsetCtl -> shared -> page_buffer [slotno ];
20742151 offptr += entryno ;
20752152
2076- MemSet (offptr , 0 , BLCKSZ - (entryno * sizeof (MultiXactOffset )));
2153+ * offptr = offset ;
2154+ if (entryno != 0 && (entryno + 1 ) * sizeof (MultiXactOffset ) != BLCKSZ )
2155+ MemSet (offptr + 1 , 0 , BLCKSZ - (entryno + 1 ) * sizeof (MultiXactOffset ));
20772156
20782157 MultiXactOffsetCtl -> shared -> page_dirty [slotno ] = true;
20792158 }
@@ -3255,13 +3334,21 @@ multixact_redo(XLogReaderState *record)
32553334
32563335 memcpy (& pageno , XLogRecGetData (record ), sizeof (int ));
32573336
3258- LWLockAcquire (MultiXactOffsetSLRULock , LW_EXCLUSIVE );
3259-
3260- slotno = ZeroMultiXactOffsetPage (pageno , false);
3261- SimpleLruWritePage (MultiXactOffsetCtl , slotno );
3262- Assert (!MultiXactOffsetCtl -> shared -> page_dirty [slotno ]);
3263-
3264- LWLockRelease (MultiXactOffsetSLRULock );
3337+ /*
3338+ * Skip the record if we already initialized the page at the previous
3339+ * XLOG_MULTIXACT_CREATE_ID record. See RecordNewMultiXact().
3340+ */
3341+ if (pre_initialized_offsets_page != pageno )
3342+ {
3343+ LWLockAcquire (MultiXactOffsetSLRULock , LW_EXCLUSIVE );
3344+ slotno = ZeroMultiXactOffsetPage (pageno , false);
3345+ SimpleLruWritePage (MultiXactOffsetCtl , slotno );
3346+ Assert (!MultiXactOffsetCtl -> shared -> page_dirty [slotno ]);
3347+ LWLockRelease (MultiXactOffsetSLRULock );
3348+ }
3349+ else
3350+ elog (DEBUG1 , "skipping initialization of offsets page %d because it was already initialized on multixid creation" , pageno );
3351+ pre_initialized_offsets_page = -1 ;
32653352 }
32663353 else if (info == XLOG_MULTIXACT_ZERO_MEM_PAGE )
32673354 {
@@ -3285,6 +3372,22 @@ multixact_redo(XLogReaderState *record)
32853372 TransactionId max_xid ;
32863373 int i ;
32873374
3375+ if (pre_initialized_offsets_page != -1 )
3376+ {
3377+ /*
3378+ * If we implicitly initialized the next offsets page while
3379+ * replaying an XLOG_MULTIXACT_CREATE_ID record that was generated
3380+ * with an older minor version, we still expect to see an
3381+ * XLOG_MULTIXACT_ZERO_OFF_PAGE record for it before any other
3382+ * XLOG_MULTIXACT_CREATE_ID records. Therefore this case should
3383+ * not happen. If it does, we'll continue with the replay, but
3384+ * log a message to note that something's funny.
3385+ */
3386+ elog (LOG , "expected to see an XLOG_MULTIXACT_ZERO_OFF_PAGE record for page %d that was implicitly initialized earlier" ,
3387+ pre_initialized_offsets_page );
3388+ pre_initialized_offsets_page = -1 ;
3389+ }
3390+
32883391 /* Store the data back into the SLRU files */
32893392 RecordNewMultiXact (xlrec -> mid , xlrec -> moff , xlrec -> nmembers ,
32903393 xlrec -> members );
0 commit comments