 #include "storage/predicate.h"
 #include "storage/procarray.h"
 #include "utils/datum.h"
-#include "utils/injection_point.h"
 #include "utils/inval.h"
 #include "utils/spccache.h"

@@ -6023,61 +6022,167 @@ heap_abort_speculative(Relation relation, ItemPointer tid)
 }

 /*
- * heap_inplace_update - update a tuple "in place" (ie, overwrite it)
- *
- * Overwriting violates both MVCC and transactional safety, so the uses
- * of this function in Postgres are extremely limited.  Nonetheless we
- * find some places to use it.
- *
- * The tuple cannot change size, and therefore it's reasonable to assume
- * that its null bitmap (if any) doesn't change either.  So we just
- * overwrite the data portion of the tuple without touching the null
- * bitmap or any of the header fields.
- *
- * tuple is an in-memory tuple structure containing the data to be written
- * over the target tuple.  Also, tuple->t_self identifies the target tuple.
- *
- * Note that the tuple updated here had better not come directly from the
- * syscache if the relation has a toast relation as this tuple could
- * include toast values that have been expanded, causing a failure here.
+ * heap_inplace_lock - protect inplace update from concurrent heap_update()
+ *
+ * Evaluate whether the tuple's state is compatible with a no-key update.
+ * Current transaction rowmarks are fine, as is KEY SHARE from any
+ * transaction.  If compatible, return true with the buffer exclusive-locked,
+ * and the caller must release that by calling
+ * heap_inplace_update_and_unlock(), calling heap_inplace_unlock(), or raising
+ * an error.  Otherwise, return false after blocking transactions, if any,
+ * have ended.
+ *
+ * Since this is intended for system catalogs and SERIALIZABLE doesn't cover
+ * DDL, this doesn't guarantee any particular predicate locking.
+ *
+ * One could modify this to return true for tuples with delete in progress.
+ * All inplace updaters take a lock that conflicts with DROP.  If an explicit
+ * "DELETE FROM pg_class" is in progress, we'll wait for it like we would an
+ * update.
+ *
+ * Readers of inplace-updated fields expect changes to those fields are
+ * durable.  For example, vac_truncate_clog() reads datfrozenxid from
+ * pg_database tuples via catalog snapshots.  A future snapshot must not
+ * return a lower datfrozenxid for the same database OID (lower in the
+ * FullTransactionIdPrecedes() sense).  We achieve that since no update of a
+ * tuple can start while we hold a lock on its buffer.  In cases like
+ * BEGIN;GRANT;CREATE INDEX;COMMIT we're inplace-updating a tuple visible only
+ * to this transaction.  ROLLBACK then is one case where it's okay to lose
+ * inplace updates.  (Restoring relhasindex=false on ROLLBACK is fine, since
+ * any concurrent CREATE INDEX would have blocked, then inplace-updated the
+ * committed tuple.)
+ *
+ * In principle, we could avoid waiting by overwriting every tuple in the
+ * updated tuple chain.  Reader expectations permit updating a tuple only if
+ * it's aborted, is the tail of the chain, or we already updated the tuple
+ * referenced in its t_ctid.  Hence, we would need to overwrite the tuples in
+ * order from tail to head.  That would imply either (a) mutating all tuples
+ * in one critical section or (b) accepting a chance of partial completion.
+ * Partial completion of a relfrozenxid update would have the weird
+ * consequence that the table's next VACUUM could see the table's relfrozenxid
+ * move forward between vacuum_get_cutoffs() and finishing.
  */
-void
-heap_inplace_update(Relation relation, HeapTuple tuple)
+bool
+heap_inplace_lock(Relation relation,
+                  HeapTuple oldtup_ptr, Buffer buffer)
 {
-    Buffer      buffer;
-    Page        page;
-    OffsetNumber offnum;
-    ItemId      lp = NULL;
-    HeapTupleHeader htup;
-    uint32      oldlen;
-    uint32      newlen;
+    HeapTupleData oldtup = *oldtup_ptr; /* minimize diff vs. heap_update() */
+    TM_Result   result;
+    bool        ret;

-    /*
-     * For now, we don't allow parallel updates.  Unlike a regular update,
-     * this should never create a combo CID, so it might be possible to relax
-     * this restriction, but not without more thought and testing.  It's not
-     * clear that it would be useful, anyway.
+    Assert(BufferIsValid(buffer));
+
+    LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
+
+    /*----------
+     * Interpret HeapTupleSatisfiesUpdate() like heap_update() does, except:
+     *
+     * - wait unconditionally
+     * - no tuple locks
+     * - don't recheck header after wait: simpler to defer to next iteration
+     * - don't try to continue even if the updater aborts: likewise
+     * - no crosscheck
      */
-    if (IsInParallelMode())
+    result = HeapTupleSatisfiesUpdate(&oldtup, GetCurrentCommandId(false),
+                                      buffer);
+
+    if (result == TM_Invisible)
+    {
+        /* no known way this can happen */
         ereport(ERROR,
-                (errcode(ERRCODE_INVALID_TRANSACTION_STATE),
-                 errmsg("cannot update tuples during a parallel operation")));
+                (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+                 errmsg_internal("attempted to overwrite invisible tuple")));
+    }
+    else if (result == TM_SelfModified)
+    {
+        /*
+         * CREATE INDEX might reach this if an expression is silly enough to
+         * call e.g. SELECT ... FROM pg_class FOR SHARE.  C code of other SQL
+         * statements might get here after a heap_update() of the same row, in
+         * the absence of an intervening CommandCounterIncrement().
+         */
+        ereport(ERROR,
+                (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+                 errmsg("tuple to be updated was already modified by an operation triggered by the current command")));
+    }
+    else if (result == TM_BeingModified)
+    {
+        TransactionId xwait;
+        uint16      infomask;

-    INJECTION_POINT("inplace-before-pin");
-    buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(&(tuple->t_self)));
-    LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
-    page = (Page) BufferGetPage(buffer);
+        xwait = HeapTupleHeaderGetRawXmax(oldtup.t_data);
+        infomask = oldtup.t_data->t_infomask;

-    offnum = ItemPointerGetOffsetNumber(&(tuple->t_self));
-    if (PageGetMaxOffsetNumber(page) >= offnum)
-        lp = PageGetItemId(page, offnum);
+        if (infomask & HEAP_XMAX_IS_MULTI)
+        {
+            LockTupleMode lockmode = LockTupleNoKeyExclusive;
+            MultiXactStatus mxact_status = MultiXactStatusNoKeyUpdate;
+            int         remain;
+            bool        current_is_member;

-    if (PageGetMaxOffsetNumber(page) < offnum || !ItemIdIsNormal(lp))
-        elog(ERROR, "invalid lp");
+            if (DoesMultiXactIdConflict((MultiXactId) xwait, infomask,
+                                        lockmode, &current_is_member))
+            {
+                LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
+                ret = false;
+                MultiXactIdWait((MultiXactId) xwait, mxact_status, infomask,
+                                relation, &oldtup.t_self, XLTW_Update,
+                                &remain);
+            }
+            else
+                ret = true;
+        }
+        else if (TransactionIdIsCurrentTransactionId(xwait))
+            ret = true;
+        else if (HEAP_XMAX_IS_KEYSHR_LOCKED(infomask))
+            ret = true;
+        else
+        {
+            LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
+            ret = false;
+            XactLockTableWait(xwait, relation, &oldtup.t_self,
+                              XLTW_Update);
+        }
+    }
+    else
+    {
+        ret = (result == TM_Ok);
+        if (!ret)
+        {
+            LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
+        }
+    }

-    htup = (HeapTupleHeader) PageGetItem(page, lp);
+    /*
+     * GetCatalogSnapshot() relies on invalidation messages to know when to
+     * take a new snapshot.  COMMIT of xwait is responsible for sending the
+     * invalidation.  We're not acquiring heavyweight locks sufficient to
+     * block if not yet sent, so we must take a new snapshot to ensure a later
+     * attempt has a fair chance.  While we don't need this if xwait aborted,
+     * don't bother optimizing that.
+     */
+    if (!ret)
+        InvalidateCatalogSnapshot();
+    return ret;
+}
+
+/*
+ * heap_inplace_update_and_unlock - core of systable_inplace_update_finish
+ *
+ * The tuple cannot change size, and therefore its header fields and null
+ * bitmap (if any) don't change either.
+ */
+void
+heap_inplace_update_and_unlock(Relation relation,
+                               HeapTuple oldtup, HeapTuple tuple,
+                               Buffer buffer)
+{
+    HeapTupleHeader htup = oldtup->t_data;
+    uint32      oldlen;
+    uint32      newlen;

-    oldlen = ItemIdGetLength(lp) - htup->t_hoff;
+    Assert(ItemPointerEquals(&oldtup->t_self, &tuple->t_self));
+    oldlen = oldtup->t_len - htup->t_hoff;
     newlen = tuple->t_len - tuple->t_data->t_hoff;
     if (oldlen != newlen || htup->t_hoff != tuple->t_data->t_hoff)
         elog(ERROR, "wrong tuple length");
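Reviewer note: the "wrong tuple length" check above is the constraint callers must satisfy when they build the replacement tuple. Below is a minimal sketch, not part of the patch, of how a caller might construct such a tuple for a fixed-width pg_class column using the stock heap_copytuple()/GETSTRUCT() APIs; the helper name make_inplace_replacement is hypothetical.

```c
#include "postgres.h"

#include "access/htup_details.h"
#include "catalog/pg_class.h"

/*
 * Sketch only: copy the existing tuple and mutate a fixed-width field.
 * Varlena/toastable fields must stay untouched, since the overwrite may
 * change neither t_len nor t_hoff (see the "wrong tuple length" check).
 */
static HeapTuple
make_inplace_replacement(HeapTuple oldtup)
{
    HeapTuple   newtup = heap_copytuple(oldtup);    /* same length, same header */
    Form_pg_class classform = (Form_pg_class) GETSTRUCT(newtup);

    classform->relhasindex = true;  /* example: a fixed-width boolean column */

    /* heap_copytuple() preserved t_self, which still identifies the target */
    return newtup;
}
```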
@@ -6089,6 +6194,19 @@ heap_inplace_update(Relation relation, HeapTuple tuple)
            (char *) tuple->t_data + tuple->t_data->t_hoff,
            newlen);

+    /*----------
+     * XXX A crash here can allow datfrozenxid to get ahead of relfrozenxid:
+     *
+     * ["D" is a VACUUM (ONLY_DATABASE_STATS)]
+     * ["R" is a VACUUM tbl]
+     * D: vac_update_datfrozenxid() -> systable_beginscan(pg_class)
+     * D: systable_getnext() returns pg_class tuple of tbl
+     * R: memcpy() into pg_class tuple of tbl
+     * D: raise pg_database.datfrozenxid, XLogInsert(), finish
+     * [crash]
+     * [recovery restores datfrozenxid w/o relfrozenxid]
+     */
+
     MarkBufferDirty(buffer);

     /* XLOG stuff */
@@ -6109,23 +6227,35 @@ heap_inplace_update(Relation relation, HeapTuple tuple)

         recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_INPLACE);

-        PageSetLSN(page, recptr);
+        PageSetLSN(BufferGetPage(buffer), recptr);
     }

     END_CRIT_SECTION();

-    UnlockReleaseBuffer(buffer);
+    heap_inplace_unlock(relation, oldtup, buffer);

     /*
      * Send out shared cache inval if necessary.  Note that because we only
      * pass the new version of the tuple, this mustn't be used for any
      * operations that could change catcache lookup keys.  But we aren't
      * bothering with index updates either, so that's true a fortiori.
+     *
+     * XXX ROLLBACK discards the invalidation.  See test inplace-inval.spec.
      */
     if (!IsBootstrapProcessingMode())
         CacheInvalidateHeapTuple(relation, tuple, NULL);
 }

+/*
+ * heap_inplace_unlock - reverse of heap_inplace_lock
+ */
+void
+heap_inplace_unlock(Relation relation,
+                    HeapTuple oldtup, Buffer buffer)
+{
+    LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
+}
+
 #define FRM_NOOP                0x0001
 #define FRM_INVALIDATE_XMAX     0x0002
 #define FRM_RETURN_IS_XID       0x0004
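Reviewer note: taken together, the three new entry points imply a pin/lock/retry loop on the caller's side. The sketch below only illustrates that contract; the actual caller, per the header comments, is expected to be systable_inplace_update_finish() (presumably with a matching "begin" routine driving the retry). fetch_catalog_tuple() and inplace_update_sketch() are hypothetical stand-ins, the former for re-running the catalog scan under a fresh snapshot after InvalidateCatalogSnapshot().

```c
#include "postgres.h"

#include "access/heapam.h"
#include "storage/bufmgr.h"

/*
 * Sketch only: loop until heap_inplace_lock() says the overwrite may proceed.
 * A false return means we waited out a blocker and the catalog snapshot was
 * invalidated, so the next iteration re-fetches the tuple.
 */
static void
inplace_update_sketch(Relation relation, HeapTuple newtup)
{
    for (;;)
    {
        HeapTupleData oldtup;
        Buffer      buffer;

        /* hypothetical: locate the latest tuple version; t_data points into buffer */
        fetch_catalog_tuple(relation, &oldtup, &buffer);

        if (heap_inplace_lock(relation, &oldtup, buffer))
        {
            /* heap_inplace_update_and_unlock() asserts matching t_self */
            newtup->t_self = oldtup.t_self;
            heap_inplace_update_and_unlock(relation, &oldtup, newtup, buffer);
            ReleaseBuffer(buffer);      /* lock already released; drop the pin */
            return;
        }

        /* blocker ended; buffer is unlocked but still pinned, so unpin and retry */
        ReleaseBuffer(buffer);
    }
}
```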