/*-------------------------------------------------------------------------
*
* procarray.c
* POSTGRES process array code.
*
*
* This module maintains arrays of PGPROC substructures, as well as associated
* arrays in ProcGlobal, for all active backends. Although there are several
* uses for this, the principal one is as a means of determining the set of
* currently running transactions.
*
* Because of various subtle race conditions it is critical that a backend
* hold the correct locks while setting or clearing its xid (in
* ProcGlobal->xids[]/MyProc->xid). See notes in
* src/backend/access/transam/README.
*
* The process arrays now also include structures representing prepared
* transactions. The xid and subxids fields of these are valid, as are the
* myProcLocks lists. They can be distinguished from regular backend PGPROCs
* at need by checking for pid == 0.
*
* During hot standby, we also keep a list of XIDs representing transactions
* that are known to be running on the primary (or more precisely, were running
* as of the current point in the WAL stream). This list is kept in the
* KnownAssignedXids array, and is updated by watching the sequence of
* arriving XIDs. This is necessary because if we leave those XIDs out of
* snapshots taken for standby queries, then they will appear to be already
* complete, leading to MVCC failures. Note that in hot standby, the PGPROC
* array represents standby processes, which by definition are not running
* transactions that have XIDs.
*
* It is perhaps possible for a backend on the primary to terminate without
* writing an abort record for its transaction. While that shouldn't really
* happen, it would tie up KnownAssignedXids indefinitely, so we protect
* ourselves by pruning the array when a valid list of running XIDs arrives.
*
* Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
* IDENTIFICATION
* src/backend/storage/ipc/procarray.c
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include <signal.h>
#include "access/clog.h"
#include "access/subtrans.h"
#include "access/transam.h"
#include "access/twophase.h"
#include "access/xact.h"
#include "access/xlogutils.h"
#include "catalog/catalog.h"
#include "catalog/pg_authid.h"
#include "commands/dbcommands.h"
#include "miscadmin.h"
#include "pgstat.h"
#include "port/pg_lfind.h"
#include "storage/proc.h"
#include "storage/procarray.h"
#include "storage/spin.h"
#include "utils/acl.h"
#include "utils/builtins.h"
#include "utils/rel.h"
#include "utils/snapmgr.h"
/*
* Read var through a volatile-qualified uint32 lvalue and yield the value as
* a uint32, so the access is performed as written instead of being cached or
* repeated by the compiler.
*/
#define UINT32_ACCESS_ONCE(var) ((uint32)(*((volatile uint32 *)&(var))))
/*
* Our shared memory area.
*
* CreateSharedProcArray() creates (or attaches to) the single instance of
* this struct and stores its address in the file-level procArray pointer.
* The trailing pgprocnos[] array is sized for PROCARRAY_MAXPROCS entries and
* is kept sorted by ProcArrayAdd() and ProcArrayRemove().
*/
typedef struct ProcArrayStruct
{
int numProcs; /* number of valid pgprocnos[] entries */
int maxProcs; /* allocated size of pgprocnos[] */
/*
* Known assigned XIDs handling.  These fields describe the
* KnownAssignedXids / KnownAssignedXidsValid arrays, which live in
* separate shared-memory allocations.
*/
int maxKnownAssignedXids; /* allocated size of array */
int numKnownAssignedXids; /* current # of valid entries */
int tailKnownAssignedXids; /* index of oldest valid element */
int headKnownAssignedXids; /* index of newest element, + 1 */
slock_t known_assigned_xids_lck; /* protects head/tail pointers */
/*
* Highest subxid that has been removed from KnownAssignedXids array to
* prevent overflow; or InvalidTransactionId if none. We track this for
* similar reasons to tracking overflowing cached subxids in PGPROC
* entries. Must hold exclusive ProcArrayLock to change this, and shared
* lock to read it.
*/
TransactionId lastOverflowedXid;
/* oldest xmin of any replication slot */
TransactionId replication_slot_xmin;
/* oldest catalog xmin of any replication slot */
TransactionId replication_slot_catalog_xmin;
/*
* Indexes into allProcs[]; has PROCARRAY_MAXPROCS entries, of which the
* first numProcs are valid.
*/
int pgprocnos[FLEXIBLE_ARRAY_MEMBER];
} ProcArrayStruct;
/*
* State for the GlobalVisTest* family of functions. Those functions can
* e.g. be used to decide if a deleted row can be removed without violating
* MVCC semantics: If the deleted row's xmax is not considered to be running
* by anyone, the row can be removed.
*
* To avoid slowing down GetSnapshotData(), we don't calculate a precise
* cutoff XID while building a snapshot (looking at the frequently changing
* xmins scales badly). Instead we compute two boundaries while building the
* snapshot:
*
* 1) definitely_needed, indicating that rows deleted by XIDs >=
* definitely_needed are definitely still visible.
*
* 2) maybe_needed, indicating that rows deleted by XIDs < maybe_needed can
* definitely be removed
*
* When testing an XID that falls in between the two (i.e. XID >= maybe_needed
* && XID < definitely_needed), the boundaries can be recomputed (using
* ComputeXidHorizons()) to get a more accurate answer. This is cheaper than
* maintaining an accurate value all the time.
*
* As it is not cheap to compute accurate boundaries, we limit the number of
* times that happens in short succession. See GlobalVisTestShouldUpdate().
*
*
* There are four backend lifetime instances of this struct, optimized for
* different types of relations. As e.g. a normal user defined table in one
* database is inaccessible to backends connected to another database, a test
* specific to a relation can be more aggressive than a test for a shared
* relation. Currently we track four different states:
*
* 1) GlobalVisSharedRels, which only considers an XID's
* effects visible-to-everyone if neither snapshots in any database, nor a
* replication slot's xmin, nor a replication slot's catalog_xmin might
* still consider XID as running.
*
* 2) GlobalVisCatalogRels, which only considers an XID's
* effects visible-to-everyone if neither snapshots in the current
* database, nor a replication slot's xmin, nor a replication slot's
* catalog_xmin might still consider XID as running.
*
* I.e. the difference to GlobalVisSharedRels is that
* snapshots in other databases are ignored.
*
* 3) GlobalVisDataRels, which only considers an XID's
* effects visible-to-everyone if neither snapshots in the current
* database, nor a replication slot's xmin consider XID as running.
*
* I.e. the difference to GlobalVisCatalogRels is that
* replication slot's catalog_xmin is not taken into account.
*
* 4) GlobalVisTempRels, which only considers the current session, as temp
* tables are not visible to other sessions.
*
* GlobalVisTestFor(relation) returns the appropriate state
* for the relation.
*
* The boundaries are FullTransactionIds instead of TransactionIds to avoid
* wraparound dangers. Otherwise there would e.g. be no procarray state
* preventing maybe_needed from becoming too old after the GetSnapshotData()
* call.
*
* The typedef is in the header.
*/
struct GlobalVisState
{
/*
* XIDs >= this boundary are considered running by some backend, i.e. rows
* deleted by such XIDs are definitely still visible.
*/
FullTransactionId definitely_needed;
/*
* XIDs < this boundary are not considered to be running by any backend,
* i.e. rows deleted by such XIDs can definitely be removed.
*/
FullTransactionId maybe_needed;
};
/*
* Result of ComputeXidHorizons().
*
* Every member is a plain TransactionId except latest_completed, which is a
* FullTransactionId.
*/
typedef struct ComputeXidHorizonsResult
{
/*
* The value of ShmemVariableCache->latestCompletedXid when
* ComputeXidHorizons() held ProcArrayLock.
*/
FullTransactionId latest_completed;
/*
* The same for procArray->replication_slot_xmin and
* procArray->replication_slot_catalog_xmin.
*/
TransactionId slot_xmin;
TransactionId slot_catalog_xmin;
/*
* Oldest xid that any backend might still consider running. This needs to
* include processes running VACUUM, in contrast to the normal visibility
* cutoffs, as vacuum needs to be able to perform pg_subtrans lookups when
* determining visibility, but doesn't care about rows above its xmin
* being removed.
*
* This likely should only be needed to determine whether pg_subtrans can
* be truncated. It currently includes the effects of replication slots,
* for historical reasons. But that could likely be changed.
*/
TransactionId oldest_considered_running;
/*
* Oldest xid for which deleted tuples need to be retained in shared
* tables.
*
* This includes the effects of replication slots. If that's not desired,
* look at shared_oldest_nonremovable_raw.
*/
TransactionId shared_oldest_nonremovable;
/*
* Oldest xid that may be necessary to retain in shared tables. This is
* the same as shared_oldest_nonremovable, except that it is not affected
* by replication slot's catalog_xmin.
*
* This is mainly useful to be able to send the catalog_xmin to upstream
* streaming replication servers via hot_standby_feedback, so they can
* apply the limit only when accessing catalog tables.
*/
TransactionId shared_oldest_nonremovable_raw;
/*
* Oldest xid for which deleted tuples need to be retained in non-shared
* catalog tables.
*/
TransactionId catalog_oldest_nonremovable;
/*
* Oldest xid for which deleted tuples need to be retained in normal user
* defined tables.
*/
TransactionId data_oldest_nonremovable;
/*
* Oldest xid for which deleted tuples need to be retained in this
* session's temporary tables.
*/
TransactionId temp_oldest_nonremovable;
} ComputeXidHorizonsResult;
/*
* Return value for GlobalVisHorizonKindForRel().
*
* NOTE(review): the per-value descriptions below are inferred from the
* enumerator names and from the four GlobalVisState instances declared in
* this file; GlobalVisHorizonKindForRel() itself is not visible here, so
* confirm against it.
*/
typedef enum GlobalVisHorizonKind
{
VISHORIZON_SHARED, /* presumably: shared relations */
VISHORIZON_CATALOG, /* presumably: non-shared catalog relations */
VISHORIZON_DATA, /* presumably: normal user defined relations */
VISHORIZON_TEMP /* presumably: this session's temporary relations */
} GlobalVisHorizonKind;
/*
* Reason codes for KnownAssignedXidsCompress().
*
* A value of this type is passed as the first argument of
* KnownAssignedXidsCompress() to tell it why compression was requested.
*/
typedef enum KAXCompressReason
{
KAX_NO_SPACE, /* need to free up space at array end */
KAX_PRUNE, /* we just pruned old entries */
KAX_TRANSACTION_END, /* we just committed/removed some XIDs */
KAX_STARTUP_PROCESS_IDLE /* startup process is about to sleep */
} KAXCompressReason;
/*
* Pointers into shared memory, set up by CreateSharedProcArray(): procArray
* addresses the shared ProcArrayStruct, allProcs is a copy of
* ProcGlobal->allProcs.
*/
static ProcArrayStruct *procArray;
static PGPROC *allProcs;
/*
* Cache to reduce overhead of repeated calls to TransactionIdIsInProgress()
*/
static TransactionId cachedXidIsNotInProgress = InvalidTransactionId;
/*
* Bookkeeping for tracking emulated transactions in recovery.
*
* KnownAssignedXids and KnownAssignedXidsValid point at shared-memory arrays
* of TOTAL_MAX_CACHED_SUBXIDS elements each; CreateSharedProcArray() only
* sets them when EnableHotStandby is true.
*/
static TransactionId *KnownAssignedXids;
static bool *KnownAssignedXidsValid;
static TransactionId latestObservedXid = InvalidTransactionId;
/*
* If we're in STANDBY_SNAPSHOT_PENDING state, standbySnapshotPendingXmin is
* the highest xid that might still be running that we don't have in
* KnownAssignedXids.
*/
static TransactionId standbySnapshotPendingXmin;
/*
* State for visibility checks on different types of relations. See struct
* GlobalVisState for details. As shared, catalog, normal and temporary
* relations can have different horizons, one such state exists for each.
*/
static GlobalVisState GlobalVisSharedRels;
static GlobalVisState GlobalVisCatalogRels;
static GlobalVisState GlobalVisDataRels;
static GlobalVisState GlobalVisTempRels;
/*
* This backend's RecentXmin at the last time the accurate xmin horizon was
* recomputed, or InvalidTransactionId if it has not. Used to limit how many
* times accurate horizons are recomputed. See GlobalVisTestShouldUpdate().
*/
static TransactionId ComputeXidHorizonsResultLastXmin;
/*
* Optional XidCache instrumentation.
*
* With XIDCACHE_DEBUG defined, each xc_*_inc() macro increments the
* file-local counter of the same name and DisplayXidCache() is declared
* (ProcArrayRemove() calls it for PGPROCs whose pid is nonzero). Without
* XIDCACHE_DEBUG, every xc_*_inc() macro expands to ((void) 0), so call
* sites need no #ifdef of their own.
*/
#ifdef XIDCACHE_DEBUG
/* counters for XidCache measurement */
static long xc_by_recent_xmin = 0;
static long xc_by_known_xact = 0;
static long xc_by_my_xact = 0;
static long xc_by_latest_xid = 0;
static long xc_by_main_xid = 0;
static long xc_by_child_xid = 0;
static long xc_by_known_assigned = 0;
static long xc_no_overflow = 0;
static long xc_slow_answer = 0;
/* one increment macro per counter above */
#define xc_by_recent_xmin_inc() (xc_by_recent_xmin++)
#define xc_by_known_xact_inc() (xc_by_known_xact++)
#define xc_by_my_xact_inc() (xc_by_my_xact++)
#define xc_by_latest_xid_inc() (xc_by_latest_xid++)
#define xc_by_main_xid_inc() (xc_by_main_xid++)
#define xc_by_child_xid_inc() (xc_by_child_xid++)
#define xc_by_known_assigned_inc() (xc_by_known_assigned++)
#define xc_no_overflow_inc() (xc_no_overflow++)
#define xc_slow_answer_inc() (xc_slow_answer++)
static void DisplayXidCache(void);
#else /* !XIDCACHE_DEBUG */
/* instrumentation disabled: the increment macros are no-ops */
#define xc_by_recent_xmin_inc() ((void) 0)
#define xc_by_known_xact_inc() ((void) 0)
#define xc_by_my_xact_inc() ((void) 0)
#define xc_by_latest_xid_inc() ((void) 0)
#define xc_by_main_xid_inc() ((void) 0)
#define xc_by_child_xid_inc() ((void) 0)
#define xc_by_known_assigned_inc() ((void) 0)
#define xc_no_overflow_inc() ((void) 0)
#define xc_slow_answer_inc() ((void) 0)
#endif /* XIDCACHE_DEBUG */
/*
* Forward declarations of file-local functions.
*/
/* Primitives for KnownAssignedXids array handling for standby */
static void KnownAssignedXidsCompress(KAXCompressReason reason, bool haveLock);
static void KnownAssignedXidsAdd(TransactionId from_xid, TransactionId to_xid,
bool exclusive_lock);
static bool KnownAssignedXidsSearch(TransactionId xid, bool remove);
static bool KnownAssignedXidExists(TransactionId xid);
static void KnownAssignedXidsRemove(TransactionId xid);
static void KnownAssignedXidsRemoveTree(TransactionId xid, int nsubxids,
TransactionId *subxids);
static void KnownAssignedXidsRemovePreceding(TransactionId removeXid);
static int KnownAssignedXidsGet(TransactionId *xarray, TransactionId xmax);
static int KnownAssignedXidsGetAndSetXmin(TransactionId *xarray,
TransactionId *xmin,
TransactionId xmax);
static TransactionId KnownAssignedXidsGetOldestXmin(void);
static void KnownAssignedXidsDisplay(int trace_level);
static void KnownAssignedXidsReset(void);
/*
* Transaction-end helpers: clearing a proc's advertised XID (directly or via
* the group-clear path) and advancing latestCompletedXid.
*/
static inline void ProcArrayEndTransactionInternal(PGPROC *proc, TransactionId latestXid);
static void ProcArrayGroupClearXid(PGPROC *proc, TransactionId latestXid);
static void MaintainLatestCompletedXid(TransactionId latestXid);
static void MaintainLatestCompletedXidRecovery(TransactionId latestXid);
/* Horizon / GlobalVisState helpers */
static inline FullTransactionId FullXidRelativeTo(FullTransactionId rel,
TransactionId xid);
static void GlobalVisUpdateApply(ComputeXidHorizonsResult *horizons);
/*
 * ProcArrayShmemSize
 *		Report the shared-memory space needed by CreateSharedProcArray().
 *
 * The result always covers the ProcArrayStruct header plus one pgprocnos[]
 * slot for each of the PROCARRAY_MAXPROCS possible entries.  When
 * EnableHotStandby is set it also covers the KnownAssignedXids and
 * KnownAssignedXidsValid arrays, TOTAL_MAX_CACHED_SUBXIDS elements apiece.
 * All sums and products are formed with add_size()/mul_size().
 *
 * About TOTAL_MAX_CACHED_SUBXIDS: besides the shared KnownAssignedXids
 * structure used during Hot Standby, backends build local structures in
 * GetSnapshotData(), TransactionIdIsInProgress() and
 * GetRunningTransactionData().  Those structures may be copied around
 * wholesale, so they must all have the same size, and that common size is
 * what the macro names.
 *
 * Ideally the Hot Standby arrays would only be accounted for when hot standby
 * is really going to be used in this run, but that isn't known yet when
 * shared memory is being set up.
 *
 * Note that both macros defined here are also used further down this file.
 */
Size
ProcArrayShmemSize(void)
{
#define PROCARRAY_MAXPROCS	(MaxBackends + max_prepared_xacts)
#define TOTAL_MAX_CACHED_SUBXIDS \
	((PGPROC_MAX_CACHED_SUBXIDS + 1) * PROCARRAY_MAXPROCS)

	/* The ProcArray structure itself: fixed header plus pgprocnos[] */
	Size		total = add_size(offsetof(ProcArrayStruct, pgprocnos),
								 mul_size(sizeof(int), PROCARRAY_MAXPROCS));

	/* Nothing more to reserve unless hot standby is enabled */
	if (!EnableHotStandby)
		return total;

	/* KnownAssignedXids[] */
	total = add_size(total,
					 mul_size(sizeof(TransactionId), TOTAL_MAX_CACHED_SUBXIDS));

	/* KnownAssignedXidsValid[] */
	total = add_size(total,
					 mul_size(sizeof(bool), TOTAL_MAX_CACHED_SUBXIDS));

	return total;
}
/*
* Initialize the shared PGPROC array during postmaster startup.
*/
void
CreateSharedProcArray(void)
{
bool found;
/* Create or attach to the ProcArray shared structure */
procArray = (ProcArrayStruct *)
ShmemInitStruct("Proc Array",
add_size(offsetof(ProcArrayStruct, pgprocnos),
mul_size(sizeof(int),
PROCARRAY_MAXPROCS)),
&found);
if (!found)
{
/*
* We're the first - initialize.
*/
procArray->numProcs = 0;
procArray->maxProcs = PROCARRAY_MAXPROCS;
procArray->maxKnownAssignedXids = TOTAL_MAX_CACHED_SUBXIDS;
procArray->numKnownAssignedXids = 0;
procArray->tailKnownAssignedXids = 0;
procArray->headKnownAssignedXids = 0;
SpinLockInit(&procArray->known_assigned_xids_lck);
procArray->lastOverflowedXid = InvalidTransactionId;
procArray->replication_slot_xmin = InvalidTransactionId;
procArray->replication_slot_catalog_xmin = InvalidTransactionId;
ShmemVariableCache->xactCompletionCount = 1;
}
allProcs = ProcGlobal->allProcs;
/* Create or attach to the KnownAssignedXids arrays too, if needed */
if (EnableHotStandby)
{
KnownAssignedXids = (TransactionId *)
ShmemInitStruct("KnownAssignedXids",
mul_size(sizeof(TransactionId),
TOTAL_MAX_CACHED_SUBXIDS),
&found);
KnownAssignedXidsValid = (bool *)
ShmemInitStruct("KnownAssignedXidsValid",
mul_size(sizeof(bool), TOTAL_MAX_CACHED_SUBXIDS),
&found);
}
}
/*
 * ProcArrayAdd
 *		Add the specified PGPROC to the shared array.
 *
 * The new entry is inserted so that pgprocnos[] stays sorted by pgprocno,
 * which in turn keeps it sorted by PGPROC address.  That improves locality
 * of reference while traversing the ProcArray: the next PGPROC is more likely
 * to be in cache already.  Adding and removing procs happens far less often
 * than scanning the array, so the cost of shifting entries should be
 * marginal.
 *
 * The dense ProcGlobal arrays (xids, subxidStates, statusFlags) are shifted
 * in step with pgprocnos[], the new slot is seeded from the PGPROC's own
 * copies of those fields, and pgxactoff is updated for the new entry and for
 * every entry that moved.
 *
 * Raises FATAL ("sorry, too many clients already") if the array is full.
 */
void
ProcArrayAdd(PGPROC *proc)
{
	ProcArrayStruct *pa = procArray;
	int			slot = 0;
	int			ntail;

	/* See ProcGlobal comment explaining why both locks are held */
	LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
	LWLockAcquire(XidGenLock, LW_EXCLUSIVE);

	/*
	 * No room?  That really shouldn't happen: the supply of PGPROC structs
	 * is fixed too, so we should have failed earlier.
	 */
	if (pa->numProcs >= pa->maxProcs)
	{
		ereport(FATAL,
				(errcode(ERRCODE_TOO_MANY_CONNECTIONS),
				 errmsg("sorry, too many clients already")));
	}

	/* Locate the first existing entry that must sort after the new one. */
	while (slot < pa->numProcs)
	{
		int			procno = pa->pgprocnos[slot];

		Assert(procno >= 0 && procno < (pa->maxProcs + NUM_AUXILIARY_PROCS));
		Assert(allProcs[procno].pgxactoff == slot);

		if (procno > proc->pgprocno)
			break;

		slot++;
	}

	/* Open a gap at "slot" in each of the parallel arrays. */
	ntail = pa->numProcs - slot;

	memmove(pa->pgprocnos + slot + 1,
			pa->pgprocnos + slot,
			ntail * sizeof(pa->pgprocnos[0]));
	memmove(ProcGlobal->xids + slot + 1,
			ProcGlobal->xids + slot,
			ntail * sizeof(ProcGlobal->xids[0]));
	memmove(ProcGlobal->subxidStates + slot + 1,
			ProcGlobal->subxidStates + slot,
			ntail * sizeof(ProcGlobal->subxidStates[0]));
	memmove(ProcGlobal->statusFlags + slot + 1,
			ProcGlobal->statusFlags + slot,
			ntail * sizeof(ProcGlobal->statusFlags[0]));

	/* Fill the gap with the new proc's data. */
	pa->pgprocnos[slot] = proc->pgprocno;
	proc->pgxactoff = slot;
	ProcGlobal->xids[slot] = proc->xid;
	ProcGlobal->subxidStates[slot] = proc->subxidStatus;
	ProcGlobal->statusFlags[slot] = proc->statusFlags;

	pa->numProcs++;

	/* Every entry behind the new one moved up by one; fix its pgxactoff. */
	for (int i = slot + 1; i < pa->numProcs; i++)
	{
		int			procno = pa->pgprocnos[i];

		Assert(procno >= 0 && procno < (pa->maxProcs + NUM_AUXILIARY_PROCS));
		Assert(allProcs[procno].pgxactoff == i - 1);

		allProcs[procno].pgxactoff = i;
	}

	/*
	 * Release in reversed acquisition order, to reduce frequency of having to
	 * wait for XidGenLock while holding ProcArrayLock.
	 */
	LWLockRelease(XidGenLock);
	LWLockRelease(ProcArrayLock);
}
/*
 * ProcArrayRemove
 *		Remove the specified PGPROC from the shared array.
 *
 * When latestXid is a valid XID, we are removing a live 2PC gxact from the
 * array, and thus causing it to appear as "not running" anymore.  In that
 * case latestCompletedXid must be advanced and the entry's XID and subxid
 * state cleared here.  This is essentially ProcArrayEndTransaction followed
 * by removal of the PGPROC, except that ProcArrayLock is taken only once and
 * the content of the PGPROC itself is left undamaged; twophase.c depends on
 * the latter.
 *
 * When latestXid is invalid, the entry must not be advertising an XID.
 *
 * The parallel arrays are compacted so they stay sorted (see ProcArrayAdd),
 * and pgxactoff is corrected for every entry that moved.
 */
void
ProcArrayRemove(PGPROC *proc, TransactionId latestXid)
{
	ProcArrayStruct *pa = procArray;
	int			slot;
	int			ntail;
	int			i;

#ifdef XIDCACHE_DEBUG
	/* dump stats at backend shutdown, but not prepared-xact end */
	if (proc->pid != 0)
		DisplayXidCache();
#endif

	/* See ProcGlobal comment explaining why both locks are held */
	LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
	LWLockAcquire(XidGenLock, LW_EXCLUSIVE);

	slot = proc->pgxactoff;

	Assert(slot >= 0 && slot < pa->numProcs);
	Assert(ProcGlobal->allProcs[pa->pgprocnos[slot]].pgxactoff == slot);

	if (!TransactionIdIsValid(latestXid))
	{
		/* Shouldn't be trying to remove a live transaction here */
		Assert(!TransactionIdIsValid(ProcGlobal->xids[slot]));
	}
	else
	{
		Assert(TransactionIdIsValid(ProcGlobal->xids[slot]));

		/* Advance global latestCompletedXid while holding the lock */
		MaintainLatestCompletedXid(latestXid);

		/* Same with xactCompletionCount */
		ShmemVariableCache->xactCompletionCount++;

		ProcGlobal->xids[slot] = InvalidTransactionId;
		ProcGlobal->subxidStates[slot].overflowed = false;
		ProcGlobal->subxidStates[slot].count = 0;
	}

	/* On either path the slot must now carry no transaction state. */
	Assert(!TransactionIdIsValid(ProcGlobal->xids[slot]));
	Assert(ProcGlobal->subxidStates[slot].count == 0);
	Assert(ProcGlobal->subxidStates[slot].overflowed == false);

	ProcGlobal->statusFlags[slot] = 0;

	/* Close the gap, which keeps the arrays sorted. */
	ntail = pa->numProcs - slot - 1;

	memmove(pa->pgprocnos + slot,
			pa->pgprocnos + slot + 1,
			ntail * sizeof(pa->pgprocnos[0]));
	memmove(ProcGlobal->xids + slot,
			ProcGlobal->xids + slot + 1,
			ntail * sizeof(ProcGlobal->xids[0]));
	memmove(ProcGlobal->subxidStates + slot,
			ProcGlobal->subxidStates + slot + 1,
			ntail * sizeof(ProcGlobal->subxidStates[0]));
	memmove(ProcGlobal->statusFlags + slot,
			ProcGlobal->statusFlags + slot + 1,
			ntail * sizeof(ProcGlobal->statusFlags[0]));

	pa->pgprocnos[pa->numProcs - 1] = -1;	/* for debugging */
	pa->numProcs--;

	/*
	 * Each entry behind the removed one moved down by one; fix its
	 * pgxactoff.  (numProcs has already been decremented.)
	 */
	i = slot;
	while (i < pa->numProcs)
	{
		int			procno = pa->pgprocnos[i];

		Assert(procno >= 0 && procno < (pa->maxProcs + NUM_AUXILIARY_PROCS));
		Assert(allProcs[procno].pgxactoff - 1 == i);

		allProcs[procno].pgxactoff = i;
		i++;
	}

	/*
	 * Release in reversed acquisition order, to reduce frequency of having to
	 * wait for XidGenLock while holding ProcArrayLock.
	 */
	LWLockRelease(XidGenLock);
	LWLockRelease(ProcArrayLock);
}
/*
 * ProcArrayEndTransaction -- mark a transaction as no longer running
 *
 * Used for commit and abort alike.  The commit/abort must already have been
 * reported to WAL and pg_xact.
 *
 * proc is currently always MyProc, but it is passed explicitly for
 * flexibility.  latestXid is the latest Xid among the transaction's main XID
 * and subtransactions, or InvalidTransactionId if it has no XID.  The caller
 * has to supply latestXid, rather than us computing it from the PGPROC's
 * contents, because the subxid information in the PGPROC might be
 * incomplete.
 */
void
ProcArrayEndTransaction(PGPROC *proc, TransactionId latestXid)
{
	if (!TransactionIdIsValid(latestXid))
	{
		/*
		 * No XID: no lock is needed for the per-proc fields, since we won't
		 * affect anyone else's calculation of a snapshot.  We might change
		 * their estimate of global xmin, but that's OK.
		 */
		Assert(!TransactionIdIsValid(proc->xid));
		Assert(proc->subxidStatus.count == 0);
		Assert(!proc->subxidStatus.overflowed);

		proc->lxid = InvalidLocalTransactionId;
		proc->xmin = InvalidTransactionId;

		/* be sure this is cleared in abort */
		proc->delayChkptFlags = 0;

		proc->recoveryConflictPending = false;

		/*
		 * The vacuum state flags must be cleared along with xid/xmin.  Test
		 * first, so the lock is taken and the shared array written only when
		 * there is something to clear; that avoids unnecessarily dirtying
		 * shared cachelines.
		 */
		if ((proc->statusFlags & PROC_VACUUM_STATE_MASK) != 0)
		{
			Assert(!LWLockHeldByMe(ProcArrayLock));
			LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
			Assert(proc->statusFlags == ProcGlobal->statusFlags[proc->pgxactoff]);
			proc->statusFlags &= ~PROC_VACUUM_STATE_MASK;
			ProcGlobal->statusFlags[proc->pgxactoff] = proc->statusFlags;
			LWLockRelease(ProcArrayLock);
		}

		return;
	}

	/*
	 * We have an XID.  It must be cleared under ProcArrayLock, so that we do
	 * not exit the set of "running" transactions while someone else is
	 * taking a snapshot.  See discussion in
	 * src/backend/access/transam/README.
	 */
	Assert(TransactionIdIsValid(proc->xid));

	/*
	 * If the lock can't be had immediately, use group XID clearing to
	 * improve efficiency.
	 */
	if (!LWLockConditionalAcquire(ProcArrayLock, LW_EXCLUSIVE))
	{
		ProcArrayGroupClearXid(proc, latestXid);
		return;
	}

	/* Got the lock right away: clear our own XID and release it. */
	ProcArrayEndTransactionInternal(proc, latestXid);
	LWLockRelease(ProcArrayLock);
}
/*
 * ProcArrayEndTransactionInternal
 *		Mark a write transaction as no longer running.
 *
 * No locking is done here; the caller must already hold ProcArrayLock in
 * exclusive mode (asserted below).  proc must currently advertise a valid
 * XID.  latestXid is handed to MaintainLatestCompletedXid().
 *
 * Besides the PGPROC itself, this clears the proc's slot in ProcGlobal's
 * xids[] and subxidStates[] arrays, drops any vacuum state flags, and bumps
 * xactCompletionCount.
 */
static inline void
ProcArrayEndTransactionInternal(PGPROC *proc, TransactionId latestXid)
{
	int			slot = proc->pgxactoff;

	/* Shared state is modified below, hence the exclusive lock. */
	Assert(LWLockHeldByMeInMode(ProcArrayLock, LW_EXCLUSIVE));
	Assert(TransactionIdIsValid(ProcGlobal->xids[slot]));
	Assert(ProcGlobal->xids[slot] == proc->xid);

	/* Stop advertising the XID, in the dense array and in the PGPROC. */
	ProcGlobal->xids[slot] = InvalidTransactionId;
	proc->xid = InvalidTransactionId;
	proc->lxid = InvalidLocalTransactionId;
	proc->xmin = InvalidTransactionId;

	/* be sure this is cleared in abort */
	proc->delayChkptFlags = 0;

	proc->recoveryConflictPending = false;

	/*
	 * The vacuum state flags must be cleared along with xid/xmin.  Only
	 * write when a flag is actually set, to avoid unnecessarily dirtying
	 * shared cachelines.
	 */
	if ((proc->statusFlags & PROC_VACUUM_STATE_MASK) != 0)
	{
		proc->statusFlags &= ~PROC_VACUUM_STATE_MASK;
		ProcGlobal->statusFlags[slot] = proc->statusFlags;
	}

	/* Clear the subtransaction-XID cache too while holding the lock */
	Assert(ProcGlobal->subxidStates[slot].count == proc->subxidStatus.count);
	Assert(ProcGlobal->subxidStates[slot].overflowed == proc->subxidStatus.overflowed);

	if (proc->subxidStatus.overflowed || proc->subxidStatus.count > 0)
	{
		ProcGlobal->subxidStates[slot].count = 0;
		ProcGlobal->subxidStates[slot].overflowed = false;
		proc->subxidStatus.count = 0;
		proc->subxidStatus.overflowed = false;
	}

	/* Also advance global latestCompletedXid while holding the lock */
	MaintainLatestCompletedXid(latestXid);

	/* Same with xactCompletionCount */
	ShmemVariableCache->xactCompletionCount++;
}
/*
* ProcArrayGroupClearXid -- group XID clearing
*
* When we cannot immediately acquire ProcArrayLock in exclusive mode at
* commit time, add ourselves to a list of processes that need their XIDs
* cleared. The first process to add itself to the list will acquire
* ProcArrayLock in exclusive mode and perform ProcArrayEndTransactionInternal
* on behalf of all group members. This avoids a great deal of contention
* around ProcArrayLock when many processes are trying to commit at once,
* since the lock need not be repeatedly handed off from one committing
* process to the next.
*
* proc is the PGPROC whose XID is to be cleared; it must currently carry a
* valid xid. latestXid is stashed in proc->procArrayGroupMemberXid and is
* later handed to ProcArrayEndTransactionInternal by whichever process ends
* up processing the list.
*
* A follower returns only once its procArrayGroupMember flag has been reset
* by the leader. The leader returns after it has processed, and then woken,
* every member of the list it detached.
*/
static void
ProcArrayGroupClearXid(PGPROC *proc, TransactionId latestXid)
{
PROC_HDR *procglobal = ProcGlobal;
uint32 nextidx;
uint32 wakeidx;
/* We should definitely have an XID to clear. */
Assert(TransactionIdIsValid(proc->xid));
/*
* Add ourselves to the list of processes needing a group XID clear.
*
* The request fields are filled in before we link ourselves into the
* list, because the leader reads procArrayGroupMemberXid from each list
* member once it has detached the list.
*/
proc->procArrayGroupMember = true;
proc->procArrayGroupMemberXid = latestXid;
nextidx = pg_atomic_read_u32(&procglobal->procArrayGroupFirst);
/*
* Push ourselves onto the head of the list. Each attempt records the head
* we observed as our successor and then tries to install our pgprocno as
* the new head. A failed compare-exchange refreshes nextidx with the
* current head (per the pg_atomic_compare_exchange_u32 contract), so we
* simply retry with the fresh value.
*/
while (true)
{
pg_atomic_write_u32(&proc->procArrayGroupNext, nextidx);
if (pg_atomic_compare_exchange_u32(&procglobal->procArrayGroupFirst,
&nextidx,
(uint32) proc->pgprocno))
break;
}
/*
* nextidx now holds the head we replaced.
*
* If the list was not empty, the leader will clear our XID. It is
* impossible to have followers without a leader because the first process
* that has added itself to the list will always have nextidx as
* INVALID_PGPROCNO.
*/
if (nextidx != INVALID_PGPROCNO)
{
/*
* Counts semaphore wakeups we consumed while our member flag was still
* set; they are re-posted below so that they are not lost.
*/
int extraWaits = 0;
/* Sleep until the leader clears our XID. */
pgstat_report_wait_start(WAIT_EVENT_PROCARRAY_GROUP_UPDATE);
for (;;)
{
/* acts as a read barrier */
PGSemaphoreLock(proc->sem);
if (!proc->procArrayGroupMember)
break;
extraWaits++;
}
pgstat_report_wait_end();
/*
* The leader resets our next-link before it clears our member flag, so
* the link must be invalid by the time we get here.
*/
Assert(pg_atomic_read_u32(&proc->procArrayGroupNext) == INVALID_PGPROCNO);
/* Fix semaphore count for any absorbed wakeups */
while (extraWaits-- > 0)
PGSemaphoreUnlock(proc->sem);
return;
}
/* We are the leader. Acquire the lock on behalf of everyone. */
LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
/*
* Now that we've got the lock, clear the list of processes waiting for
* group XID clearing, saving a pointer to the head of the list. Trying
* to pop elements one at a time could lead to an ABA problem.
*/
nextidx = pg_atomic_exchange_u32(&procglobal->procArrayGroupFirst,
INVALID_PGPROCNO);
/* Remember head of list so we can perform wakeups after dropping lock. */
wakeidx = nextidx;
/*
* Walk the list and clear all XIDs. The detached list includes the
* leader's own entry, since it pushed itself above, so its XID is cleared
* by this loop as well.
*/
while (nextidx != INVALID_PGPROCNO)
{
PGPROC *nextproc = &allProcs[nextidx];
ProcArrayEndTransactionInternal(nextproc, nextproc->procArrayGroupMemberXid);
/* Move to next proc in list. */
nextidx = pg_atomic_read_u32(&nextproc->procArrayGroupNext);
}
/* We're done with the lock now. */
LWLockRelease(ProcArrayLock);
/*
* Now that we've released the lock, go back and wake everybody up. We
* don't do this under the lock so as to keep lock hold times to a
* minimum. The system calls we need to perform to wake other processes
* up are probably much slower than the simple memory writes we did while
* holding the lock.
*/
while (wakeidx != INVALID_PGPROCNO)
{
PGPROC *nextproc = &allProcs[wakeidx];
/*
* Fetch the successor before resetting the link: once the member flag
* is cleared and the semaphore posted, nextproc is free to leave its
* wait loop.
*/
wakeidx = pg_atomic_read_u32(&nextproc->procArrayGroupNext);
pg_atomic_write_u32(&nextproc->procArrayGroupNext, INVALID_PGPROCNO);
/* ensure all previous writes are visible before follower continues. */
pg_write_barrier();
nextproc->procArrayGroupMember = false;
/*
* The leader never slept on its own semaphore, so it must not be
* posted. NOTE(review): this assumes the calling backend passed its
* own PGPROC (proc == MyProc) -- confirm against callers.
*/
if (nextproc != MyProc)
PGSemaphoreUnlock(nextproc->sem);
}
}
/*
 * ProcArrayClearTransaction -- reset a PGPROC's transaction fields
 *
 * Called once a two-phase transaction has been prepared successfully.  The
 * transaction's XID is not being reported as finished here: the 2PC gxact
 * is also present in the ProcArray, so the XID keeps showing up as running.
 * All that has to happen is wiping the transaction state held in the given
 * PGPROC (and its mirrored entries in ProcGlobal's dense arrays).
 */
void
ProcArrayClearTransaction(PGPROC *proc)
{
	int			slot;
	bool		has_subxids;

	/*
	 * An exclusive ProcArrayLock is required for now because
	 * xactCompletionCount is bumped further down.  At least a shared lock
	 * would be needed anyway, to keep proc->pgxactoff from changing while we
	 * use it.
	 *
	 * Nobody's view of the set of running XIDs changes as a result of this
	 * call (our entry merely duplicates the gxact that was already added to
	 * the ProcArray), so a shared lock would be enough if
	 * xactCompletionCount were turned into an atomic variable.  That is not
	 * considered worthwhile at present: a 2PC commit is heavyweight enough
	 * that this is not the bottleneck.  Should it become one, merging this
	 * step with the subsequent ProcArrayRemove() may be worth considering
	 * too.
	 */
	LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);

	slot = proc->pgxactoff;

	/* Wipe the dense-array copy of the XID first, then the PGPROC fields. */
	ProcGlobal->xids[slot] = InvalidTransactionId;
	proc->xid = InvalidTransactionId;
	proc->lxid = InvalidLocalTransactionId;
	proc->xmin = InvalidTransactionId;
	proc->recoveryConflictPending = false;

	/* Neither vacuum-state flags nor checkpoint-delay flags may be set. */
	Assert(!(proc->statusFlags & PROC_VACUUM_STATE_MASK));
	Assert(!proc->delayChkptFlags);

	/*
	 * The completion count has to advance although the transaction has not
	 * truly committed.  GetSnapshotData() leaves out the current
	 * transaction's own xid, so without this bump a snapshot could be reused
	 * later on -- and such a snapshot might fail to treat the prepared
	 * transaction as running.
	 */
	ShmemVariableCache->xactCompletionCount += 1;

	/* The subtransaction-XID cache is reset as well, if it holds anything. */
	Assert(ProcGlobal->subxidStates[slot].count == proc->subxidStatus.count &&
		   ProcGlobal->subxidStates[slot].overflowed == proc->subxidStatus.overflowed);

	has_subxids = (proc->subxidStatus.count > 0 ||
				   proc->subxidStatus.overflowed);
	if (has_subxids)
	{
		ProcGlobal->subxidStates[slot].count = 0;
		ProcGlobal->subxidStates[slot].overflowed = false;
		proc->subxidStatus.count = 0;
		proc->subxidStatus.overflowed = false;
	}

	LWLockRelease(ProcArrayLock);
}
/*
 * Advance ShmemVariableCache->latestCompletedXid to latestXid, but only when
 * the value currently stored is older than latestXid.
 *
 * Must not be used during recovery, and the caller must hold ProcArrayLock.
 */
static void
MaintainLatestCompletedXid(TransactionId latestXid)
{
	FullTransactionId stored_full = ShmemVariableCache->latestCompletedXid;
	TransactionId stored_xid;

	Assert(FullTransactionIdIsValid(stored_full));
	Assert(!RecoveryInProgress());
	Assert(LWLockHeldByMe(ProcArrayLock));

	/* Compare on the 32-bit XID; widen latestXid only if we must advance. */
	stored_xid = XidFromFullTransactionId(stored_full);
	if (TransactionIdPrecedes(stored_xid, latestXid))
		ShmemVariableCache->latestCompletedXid =
			FullXidRelativeTo(stored_full, latestXid);

	/* Outside bootstrap mode the stored value must be a normal XID. */
	Assert(IsBootstrapProcessingMode() ||
		   FullTransactionIdIsNormal(ShmemVariableCache->latestCompletedXid));
}
/*
* Same as MaintainLatestCompletedXid, except for use during WAL replay.
*/
static void
MaintainLatestCompletedXidRecovery(TransactionId latestXid)
{
FullTransactionId cur_latest = ShmemVariableCache->latestCompletedXid;
FullTransactionId rel;
Assert(AmStartupProcess() || !IsUnderPostmaster);
Assert(LWLockHeldByMe(ProcArrayLock));
/*
* Need a FullTransactionId to compare latestXid with. Can't rely on