Skip to content

Commit bf3dba5

Browse files
committed
Fix regression with slot invalidation checks
This commit reverts 818fefd, that has been introduced to address a an instability in some of the TAP tests due to the presence of random standby snapshot WAL records, when slots are invalidated by InvalidatePossiblyObsoleteSlot(). Anyway, this commit had also the consequence of introducing a behavior regression. After 818fefd, the code may determine that a slot needs to be invalidated while it may not require one: the slot may have moved from a conflicting state to a non-conflicting state between the moment when the mutex is released and the moment when we recheck the slot, in InvalidatePossiblyObsoleteSlot(). Hence, the invalidations may be more aggressive than they actually have to. 105b2cb has tackled the test instability in a way that should be hopefully sufficient for the buildfarm, even for slow members: - In v18, the test relies on an injection point that bypasses the creation of the random records generated for standby snapshots, eliminating the random factor that impacted the test. This option was not available when 818fefd was discussed. - In v16 and v17, the problem was bypassed by disallowing a slot to become active in some of the scenarios tested. While on it, this commit adds a comment to document that it is fine for a recheck to use xmin and LSN values stored in the slot, without storing and reusing them across multiple checks. Reported-by: "suyu.cmj" <mengjuan.cmj@alibaba-inc.com> Author: Bertrand Drouvot <bertranddrouvot.pg@gmail.com> Reviewed-by: Masahiko Sawada <sawada.mshk@gmail.com> Reviewed-by: Amit Kapila <amit.kapila16@gmail.com> Discussion: https://postgr.es/m/f492465f-657e-49af-8317-987460cb68b0.mengjuan.cmj@alibaba-inc.com Backpatch-through: 16
1 parent ef6168b commit bf3dba5

File tree

1 file changed

+24
-46
lines changed

1 file changed

+24
-46
lines changed

src/backend/replication/slot.c

Lines changed: 24 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -1705,17 +1705,16 @@ static ReplicationSlotInvalidationCause
17051705
DetermineSlotInvalidationCause(uint32 possible_causes, ReplicationSlot *s,
17061706
XLogRecPtr oldestLSN, Oid dboid,
17071707
TransactionId snapshotConflictHorizon,
1708-
TransactionId initial_effective_xmin,
1709-
TransactionId initial_catalog_effective_xmin,
1710-
XLogRecPtr initial_restart_lsn,
17111708
TimestampTz *inactive_since, TimestampTz now)
17121709
{
17131710
Assert(possible_causes != RS_INVAL_NONE);
17141711

17151712
if (possible_causes & RS_INVAL_WAL_REMOVED)
17161713
{
1717-
if (initial_restart_lsn != InvalidXLogRecPtr &&
1718-
initial_restart_lsn < oldestLSN)
1714+
XLogRecPtr restart_lsn = s->data.restart_lsn;
1715+
1716+
if (restart_lsn != InvalidXLogRecPtr &&
1717+
restart_lsn < oldestLSN)
17191718
return RS_INVAL_WAL_REMOVED;
17201719
}
17211720

@@ -1725,12 +1724,15 @@ DetermineSlotInvalidationCause(uint32 possible_causes, ReplicationSlot *s,
17251724
if (SlotIsLogical(s) &&
17261725
(dboid == InvalidOid || dboid == s->data.database))
17271726
{
1728-
if (TransactionIdIsValid(initial_effective_xmin) &&
1729-
TransactionIdPrecedesOrEquals(initial_effective_xmin,
1727+
TransactionId effective_xmin = s->effective_xmin;
1728+
TransactionId catalog_effective_xmin = s->effective_catalog_xmin;
1729+
1730+
if (TransactionIdIsValid(effective_xmin) &&
1731+
TransactionIdPrecedesOrEquals(effective_xmin,
17301732
snapshotConflictHorizon))
17311733
return RS_INVAL_HORIZON;
1732-
else if (TransactionIdIsValid(initial_catalog_effective_xmin) &&
1733-
TransactionIdPrecedesOrEquals(initial_catalog_effective_xmin,
1734+
else if (TransactionIdIsValid(catalog_effective_xmin) &&
1735+
TransactionIdPrecedesOrEquals(catalog_effective_xmin,
17341736
snapshotConflictHorizon))
17351737
return RS_INVAL_HORIZON;
17361738
}
@@ -1799,11 +1801,6 @@ InvalidatePossiblyObsoleteSlot(uint32 possible_causes,
17991801
{
18001802
int last_signaled_pid = 0;
18011803
bool released_lock = false;
1802-
bool terminated = false;
1803-
TransactionId initial_effective_xmin = InvalidTransactionId;
1804-
TransactionId initial_catalog_effective_xmin = InvalidTransactionId;
1805-
XLogRecPtr initial_restart_lsn = InvalidXLogRecPtr;
1806-
ReplicationSlotInvalidationCause invalidation_cause_prev PG_USED_FOR_ASSERTS_ONLY = RS_INVAL_NONE;
18071804
TimestampTz inactive_since = 0;
18081805

18091806
for (;;)
@@ -1846,42 +1843,12 @@ InvalidatePossiblyObsoleteSlot(uint32 possible_causes,
18461843

18471844
/* we do nothing if the slot is already invalid */
18481845
if (s->data.invalidated == RS_INVAL_NONE)
1849-
{
1850-
/*
1851-
* The slot's mutex will be released soon, and it is possible that
1852-
* those values change since the process holding the slot has been
1853-
* terminated (if any), so record them here to ensure that we
1854-
* would report the correct invalidation cause.
1855-
*
1856-
* Unlike other slot attributes, slot's inactive_since can't be
1857-
* changed until the acquired slot is released or the owning
1858-
* process is terminated. So, the inactive slot can only be
1859-
* invalidated immediately without being terminated.
1860-
*/
1861-
if (!terminated)
1862-
{
1863-
initial_restart_lsn = s->data.restart_lsn;
1864-
initial_effective_xmin = s->effective_xmin;
1865-
initial_catalog_effective_xmin = s->effective_catalog_xmin;
1866-
}
1867-
18681846
invalidation_cause = DetermineSlotInvalidationCause(possible_causes,
18691847
s, oldestLSN,
18701848
dboid,
18711849
snapshotConflictHorizon,
1872-
initial_effective_xmin,
1873-
initial_catalog_effective_xmin,
1874-
initial_restart_lsn,
18751850
&inactive_since,
18761851
now);
1877-
}
1878-
1879-
/*
1880-
* The invalidation cause recorded previously should not change while
1881-
* the process owning the slot (if any) has been terminated.
1882-
*/
1883-
Assert(!(invalidation_cause_prev != RS_INVAL_NONE && terminated &&
1884-
invalidation_cause_prev != invalidation_cause));
18851852

18861853
/* if there's no invalidation, we're done */
18871854
if (invalidation_cause == RS_INVAL_NONE)
@@ -1899,6 +1866,11 @@ InvalidatePossiblyObsoleteSlot(uint32 possible_causes,
18991866
* If the slot can be acquired, do so and mark it invalidated
19001867
* immediately. Otherwise we'll signal the owning process, below, and
19011868
* retry.
1869+
*
1870+
* Note: Unlike other slot attributes, slot's inactive_since can't be
1871+
* changed until the acquired slot is released or the owning process
1872+
* is terminated. So, the inactive slot can only be invalidated
1873+
* immediately without being terminated.
19021874
*/
19031875
if (active_pid == 0)
19041876
{
@@ -1973,8 +1945,6 @@ InvalidatePossiblyObsoleteSlot(uint32 possible_causes,
19731945
(void) kill(active_pid, SIGTERM);
19741946

19751947
last_signaled_pid = active_pid;
1976-
terminated = true;
1977-
invalidation_cause_prev = invalidation_cause;
19781948
}
19791949

19801950
/* Wait until the slot is released. */
@@ -1985,6 +1955,14 @@ InvalidatePossiblyObsoleteSlot(uint32 possible_causes,
19851955
* Re-acquire lock and start over; we expect to invalidate the
19861956
* slot next time (unless another process acquires the slot in the
19871957
* meantime).
1958+
*
1959+
* Note: It is possible for a slot to advance its restart_lsn or
1960+
* xmin values sufficiently between when we release the mutex and
1961+
* when we recheck, moving from a conflicting state to a non
1962+
* conflicting state. This is intentional and safe: if the slot
1963+
* has caught up while we're busy here, the resources we were
1964+
* concerned about (WAL segments or tuples) have not yet been
1965+
* removed, and there's no reason to invalidate the slot.
19881966
*/
19891967
LWLockAcquire(ReplicationSlotControlLock, LW_SHARED);
19901968
continue;

0 commit comments

Comments
 (0)