-
Notifications
You must be signed in to change notification settings - Fork 4.5k
/
pgoutput.c
2380 lines (2058 loc) · 69.8 KB
/
pgoutput.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
/*-------------------------------------------------------------------------
*
* pgoutput.c
* Logical Replication output plugin
*
* Copyright (c) 2012-2024, PostgreSQL Global Development Group
*
* IDENTIFICATION
* src/backend/replication/pgoutput/pgoutput.c
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "access/tupconvert.h"
#include "catalog/partition.h"
#include "catalog/pg_publication.h"
#include "catalog/pg_publication_rel.h"
#include "catalog/pg_subscription.h"
#include "commands/defrem.h"
#include "commands/subscriptioncmds.h"
#include "executor/executor.h"
#include "fmgr.h"
#include "nodes/makefuncs.h"
#include "parser/parse_relation.h"
#include "replication/logical.h"
#include "replication/logicalproto.h"
#include "replication/origin.h"
#include "replication/pgoutput.h"
#include "utils/builtins.h"
#include "utils/inval.h"
#include "utils/lsyscache.h"
#include "utils/memutils.h"
#include "utils/rel.h"
#include "utils/syscache.h"
#include "utils/varlena.h"
PG_MODULE_MAGIC;
static void pgoutput_startup(LogicalDecodingContext *ctx,
OutputPluginOptions *opt, bool is_init);
static void pgoutput_shutdown(LogicalDecodingContext *ctx);
static void pgoutput_begin_txn(LogicalDecodingContext *ctx,
ReorderBufferTXN *txn);
static void pgoutput_commit_txn(LogicalDecodingContext *ctx,
ReorderBufferTXN *txn, XLogRecPtr commit_lsn);
static void pgoutput_change(LogicalDecodingContext *ctx,
ReorderBufferTXN *txn, Relation relation,
ReorderBufferChange *change);
static void pgoutput_truncate(LogicalDecodingContext *ctx,
ReorderBufferTXN *txn, int nrelations, Relation relations[],
ReorderBufferChange *change);
static void pgoutput_message(LogicalDecodingContext *ctx,
ReorderBufferTXN *txn, XLogRecPtr message_lsn,
bool transactional, const char *prefix,
Size sz, const char *message);
static bool pgoutput_origin_filter(LogicalDecodingContext *ctx,
RepOriginId origin_id);
static void pgoutput_begin_prepare_txn(LogicalDecodingContext *ctx,
ReorderBufferTXN *txn);
static void pgoutput_prepare_txn(LogicalDecodingContext *ctx,
ReorderBufferTXN *txn, XLogRecPtr prepare_lsn);
static void pgoutput_commit_prepared_txn(LogicalDecodingContext *ctx,
ReorderBufferTXN *txn, XLogRecPtr commit_lsn);
static void pgoutput_rollback_prepared_txn(LogicalDecodingContext *ctx,
ReorderBufferTXN *txn,
XLogRecPtr prepare_end_lsn,
TimestampTz prepare_time);
static void pgoutput_stream_start(struct LogicalDecodingContext *ctx,
ReorderBufferTXN *txn);
static void pgoutput_stream_stop(struct LogicalDecodingContext *ctx,
ReorderBufferTXN *txn);
static void pgoutput_stream_abort(struct LogicalDecodingContext *ctx,
ReorderBufferTXN *txn,
XLogRecPtr abort_lsn);
static void pgoutput_stream_commit(struct LogicalDecodingContext *ctx,
ReorderBufferTXN *txn,
XLogRecPtr commit_lsn);
static void pgoutput_stream_prepare_txn(LogicalDecodingContext *ctx,
ReorderBufferTXN *txn, XLogRecPtr prepare_lsn);
static bool publications_valid;
static List *LoadPublications(List *pubnames);
static void publication_invalidation_cb(Datum arg, int cacheid,
uint32 hashvalue);
static void send_relation_and_attrs(Relation relation, TransactionId xid,
LogicalDecodingContext *ctx,
Bitmapset *columns);
static void send_repl_origin(LogicalDecodingContext *ctx,
RepOriginId origin_id, XLogRecPtr origin_lsn,
bool send_origin);
/*
* Only 3 publication actions are used for row filtering ("insert", "update",
* "delete"). See RelationSyncEntry.exprstate[].
*/
enum RowFilterPubAction
{
PUBACTION_INSERT,
PUBACTION_UPDATE,
PUBACTION_DELETE,
};
#define NUM_ROWFILTER_PUBACTIONS (PUBACTION_DELETE+1)
/*
* Entry in the map used to remember which relation schemas we sent.
*
* The schema_sent flag determines if the current schema record for the
* relation (and for its ancestor if publish_as_relid is set) was already
* sent to the subscriber (in which case we don't need to send it again).
*
* The schema cache on downstream is however updated only at commit time,
* and with streamed transactions the commit order may be different from
* the order the transactions are sent in. Also, the (sub) transactions
* might get aborted so we need to send the schema for each (sub) transaction
* so that we don't lose the schema information on abort. For handling this,
* we maintain the list of xids (streamed_txns) for those we have already sent
* the schema.
*
* For partitions, 'pubactions' considers not only the table's own
* publications, but also those of all of its ancestors.
*/
typedef struct RelationSyncEntry
{
Oid relid; /* relation oid */
bool replicate_valid; /* overall validity flag for entry */
bool schema_sent;
List *streamed_txns; /* streamed toplevel transactions with this
* schema */
/* are we publishing this rel? */
PublicationActions pubactions;
/*
* ExprState array for row filter. Different publication actions don't
* allow multiple expressions to always be combined into one, because
* updates or deletes restrict the column in expression to be part of the
* replica identity index whereas inserts do not have this restriction, so
* there is one ExprState per publication action.
*/
ExprState *exprstate[NUM_ROWFILTER_PUBACTIONS];
EState *estate; /* executor state used for row filter */
TupleTableSlot *new_slot; /* slot for storing new tuple */
TupleTableSlot *old_slot; /* slot for storing old tuple */
/*
* OID of the relation to publish changes as. For a partition, this may
* be set to one of its ancestors whose schema will be used when
* replicating changes, if publish_via_partition_root is set for the
* publication.
*/
Oid publish_as_relid;
/*
* Map used when replicating using an ancestor's schema to convert tuples
* from partition's type to the ancestor's; NULL if publish_as_relid is
* same as 'relid' or if unnecessary due to partition and the ancestor
* having identical TupleDesc.
*/
AttrMap *attrmap;
/*
* Columns included in the publication, or NULL if all columns are
* included implicitly. Note that the attnums in this bitmap are not
* shifted by FirstLowInvalidHeapAttributeNumber.
*/
Bitmapset *columns;
/*
* Private context to store additional data for this entry - state for the
* row filter expressions, column list, etc.
*/
MemoryContext entry_cxt;
} RelationSyncEntry;
/*
* Maintain a per-transaction level variable to track whether the transaction
* has sent BEGIN. BEGIN is only sent when the first change in a transaction
* is processed. This makes it possible to skip sending a pair of BEGIN/COMMIT
* messages for empty transactions which saves network bandwidth.
*
* This optimization is not used for prepared transactions because if the
* WALSender restarts after prepare of a transaction and before commit prepared
* of the same transaction then we won't be able to figure out if we have
* skipped sending BEGIN/PREPARE of a transaction as it was empty. This is
* because we would have lost the in-memory txndata information that was
* present prior to the restart. This will result in sending a spurious
* COMMIT PREPARED without a corresponding prepared transaction at the
* downstream which would lead to an error when it tries to process it.
*
* XXX We could achieve this optimization by changing protocol to send
* additional information so that downstream can detect that the corresponding
* prepare has not been sent. However, adding such a check for every
* transaction in the downstream could be costly so we might want to do it
* optionally.
*
* We also don't have this optimization for streamed transactions because
* they can contain prepared transactions.
*/
typedef struct PGOutputTxnData
{
bool sent_begin_txn; /* flag indicating whether BEGIN has been sent */
} PGOutputTxnData;
/* Map used to remember which relation schemas we sent. */
static HTAB *RelationSyncCache = NULL;
static void init_rel_sync_cache(MemoryContext cachectx);
static void cleanup_rel_sync_cache(TransactionId xid, bool is_commit);
static RelationSyncEntry *get_rel_sync_entry(PGOutputData *data,
Relation relation);
static void rel_sync_cache_relation_cb(Datum arg, Oid relid);
static void rel_sync_cache_publication_cb(Datum arg, int cacheid,
uint32 hashvalue);
static void set_schema_sent_in_streamed_txn(RelationSyncEntry *entry,
TransactionId xid);
static bool get_schema_sent_in_streamed_txn(RelationSyncEntry *entry,
TransactionId xid);
static void init_tuple_slot(PGOutputData *data, Relation relation,
RelationSyncEntry *entry);
/* row filter routines */
static EState *create_estate_for_relation(Relation rel);
static void pgoutput_row_filter_init(PGOutputData *data,
List *publications,
RelationSyncEntry *entry);
static bool pgoutput_row_filter_exec_expr(ExprState *state,
ExprContext *econtext);
static bool pgoutput_row_filter(Relation relation, TupleTableSlot *old_slot,
TupleTableSlot **new_slot_ptr,
RelationSyncEntry *entry,
ReorderBufferChangeType *action);
/* column list routines */
static void pgoutput_column_list_init(PGOutputData *data,
List *publications,
RelationSyncEntry *entry);
/*
* Specify output plugin callbacks
*/
void
_PG_output_plugin_init(OutputPluginCallbacks *cb)
{
cb->startup_cb = pgoutput_startup;
cb->begin_cb = pgoutput_begin_txn;
cb->change_cb = pgoutput_change;
cb->truncate_cb = pgoutput_truncate;
cb->message_cb = pgoutput_message;
cb->commit_cb = pgoutput_commit_txn;
cb->begin_prepare_cb = pgoutput_begin_prepare_txn;
cb->prepare_cb = pgoutput_prepare_txn;
cb->commit_prepared_cb = pgoutput_commit_prepared_txn;
cb->rollback_prepared_cb = pgoutput_rollback_prepared_txn;
cb->filter_by_origin_cb = pgoutput_origin_filter;
cb->shutdown_cb = pgoutput_shutdown;
/* transaction streaming */
cb->stream_start_cb = pgoutput_stream_start;
cb->stream_stop_cb = pgoutput_stream_stop;
cb->stream_abort_cb = pgoutput_stream_abort;
cb->stream_commit_cb = pgoutput_stream_commit;
cb->stream_change_cb = pgoutput_change;
cb->stream_message_cb = pgoutput_message;
cb->stream_truncate_cb = pgoutput_truncate;
/* transaction streaming - two-phase commit */
cb->stream_prepare_cb = pgoutput_stream_prepare_txn;
}
static void
parse_output_parameters(List *options, PGOutputData *data)
{
ListCell *lc;
bool protocol_version_given = false;
bool publication_names_given = false;
bool binary_option_given = false;
bool messages_option_given = false;
bool streaming_given = false;
bool two_phase_option_given = false;
bool origin_option_given = false;
data->binary = false;
data->streaming = LOGICALREP_STREAM_OFF;
data->messages = false;
data->two_phase = false;
foreach(lc, options)
{
DefElem *defel = (DefElem *) lfirst(lc);
Assert(defel->arg == NULL || IsA(defel->arg, String));
/* Check each param, whether or not we recognize it */
if (strcmp(defel->defname, "proto_version") == 0)
{
unsigned long parsed;
char *endptr;
if (protocol_version_given)
ereport(ERROR,
(errcode(ERRCODE_SYNTAX_ERROR),
errmsg("conflicting or redundant options")));
protocol_version_given = true;
errno = 0;
parsed = strtoul(strVal(defel->arg), &endptr, 10);
if (errno != 0 || *endptr != '\0')
ereport(ERROR,
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("invalid proto_version")));
if (parsed > PG_UINT32_MAX)
ereport(ERROR,
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("proto_version \"%s\" out of range",
strVal(defel->arg))));
data->protocol_version = (uint32) parsed;
}
else if (strcmp(defel->defname, "publication_names") == 0)
{
if (publication_names_given)
ereport(ERROR,
(errcode(ERRCODE_SYNTAX_ERROR),
errmsg("conflicting or redundant options")));
publication_names_given = true;
if (!SplitIdentifierString(strVal(defel->arg), ',',
&data->publication_names))
ereport(ERROR,
(errcode(ERRCODE_INVALID_NAME),
errmsg("invalid publication_names syntax")));
}
else if (strcmp(defel->defname, "binary") == 0)
{
if (binary_option_given)
ereport(ERROR,
(errcode(ERRCODE_SYNTAX_ERROR),
errmsg("conflicting or redundant options")));
binary_option_given = true;
data->binary = defGetBoolean(defel);
}
else if (strcmp(defel->defname, "messages") == 0)
{
if (messages_option_given)
ereport(ERROR,
(errcode(ERRCODE_SYNTAX_ERROR),
errmsg("conflicting or redundant options")));
messages_option_given = true;
data->messages = defGetBoolean(defel);
}
else if (strcmp(defel->defname, "streaming") == 0)
{
if (streaming_given)
ereport(ERROR,
(errcode(ERRCODE_SYNTAX_ERROR),
errmsg("conflicting or redundant options")));
streaming_given = true;
data->streaming = defGetStreamingMode(defel);
}
else if (strcmp(defel->defname, "two_phase") == 0)
{
if (two_phase_option_given)
ereport(ERROR,
(errcode(ERRCODE_SYNTAX_ERROR),
errmsg("conflicting or redundant options")));
two_phase_option_given = true;
data->two_phase = defGetBoolean(defel);
}
else if (strcmp(defel->defname, "origin") == 0)
{
char *origin;
if (origin_option_given)
ereport(ERROR,
errcode(ERRCODE_SYNTAX_ERROR),
errmsg("conflicting or redundant options"));
origin_option_given = true;
origin = defGetString(defel);
if (pg_strcasecmp(origin, LOGICALREP_ORIGIN_NONE) == 0)
data->publish_no_origin = true;
else if (pg_strcasecmp(origin, LOGICALREP_ORIGIN_ANY) == 0)
data->publish_no_origin = false;
else
ereport(ERROR,
errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("unrecognized origin value: \"%s\"", origin));
}
else
elog(ERROR, "unrecognized pgoutput option: %s", defel->defname);
}
/* Check required options */
if (!protocol_version_given)
ereport(ERROR,
errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("option \"%s\" missing", "proto_version"));
if (!publication_names_given)
ereport(ERROR,
errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("option \"%s\" missing", "publication_names"));
}
/*
* Initialize this plugin
*/
static void
pgoutput_startup(LogicalDecodingContext *ctx, OutputPluginOptions *opt,
bool is_init)
{
PGOutputData *data = palloc0(sizeof(PGOutputData));
static bool publication_callback_registered = false;
/* Create our memory context for private allocations. */
data->context = AllocSetContextCreate(ctx->context,
"logical replication output context",
ALLOCSET_DEFAULT_SIZES);
data->cachectx = AllocSetContextCreate(ctx->context,
"logical replication cache context",
ALLOCSET_DEFAULT_SIZES);
ctx->output_plugin_private = data;
/* This plugin uses binary protocol. */
opt->output_type = OUTPUT_PLUGIN_BINARY_OUTPUT;
/*
* This is replication start and not slot initialization.
*
* Parse and validate options passed by the client.
*/
if (!is_init)
{
/* Parse the params and ERROR if we see any we don't recognize */
parse_output_parameters(ctx->output_plugin_options, data);
/* Check if we support requested protocol */
if (data->protocol_version > LOGICALREP_PROTO_MAX_VERSION_NUM)
ereport(ERROR,
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("client sent proto_version=%d but server only supports protocol %d or lower",
data->protocol_version, LOGICALREP_PROTO_MAX_VERSION_NUM)));
if (data->protocol_version < LOGICALREP_PROTO_MIN_VERSION_NUM)
ereport(ERROR,
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("client sent proto_version=%d but server only supports protocol %d or higher",
data->protocol_version, LOGICALREP_PROTO_MIN_VERSION_NUM)));
/*
* Decide whether to enable streaming. It is disabled by default, in
* which case we just update the flag in decoding context. Otherwise
* we only allow it with sufficient version of the protocol, and when
* the output plugin supports it.
*/
if (data->streaming == LOGICALREP_STREAM_OFF)
ctx->streaming = false;
else if (data->streaming == LOGICALREP_STREAM_ON &&
data->protocol_version < LOGICALREP_PROTO_STREAM_VERSION_NUM)
ereport(ERROR,
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("requested proto_version=%d does not support streaming, need %d or higher",
data->protocol_version, LOGICALREP_PROTO_STREAM_VERSION_NUM)));
else if (data->streaming == LOGICALREP_STREAM_PARALLEL &&
data->protocol_version < LOGICALREP_PROTO_STREAM_PARALLEL_VERSION_NUM)
ereport(ERROR,
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("requested proto_version=%d does not support parallel streaming, need %d or higher",
data->protocol_version, LOGICALREP_PROTO_STREAM_PARALLEL_VERSION_NUM)));
else if (!ctx->streaming)
ereport(ERROR,
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("streaming requested, but not supported by output plugin")));
/*
* Here, we just check whether the two-phase option is passed by
* plugin and decide whether to enable it at later point of time. It
* remains enabled if the previous start-up has done so. But we only
* allow the option to be passed in with sufficient version of the
* protocol, and when the output plugin supports it.
*/
if (!data->two_phase)
ctx->twophase_opt_given = false;
else if (data->protocol_version < LOGICALREP_PROTO_TWOPHASE_VERSION_NUM)
ereport(ERROR,
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("requested proto_version=%d does not support two-phase commit, need %d or higher",
data->protocol_version, LOGICALREP_PROTO_TWOPHASE_VERSION_NUM)));
else if (!ctx->twophase)
ereport(ERROR,
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("two-phase commit requested, but not supported by output plugin")));
else
ctx->twophase_opt_given = true;
/* Init publication state. */
data->publications = NIL;
publications_valid = false;
/*
* Register callback for pg_publication if we didn't already do that
* during some previous call in this process.
*/
if (!publication_callback_registered)
{
CacheRegisterSyscacheCallback(PUBLICATIONOID,
publication_invalidation_cb,
(Datum) 0);
publication_callback_registered = true;
}
/* Initialize relation schema cache. */
init_rel_sync_cache(CacheMemoryContext);
}
else
{
/*
* Disable the streaming and prepared transactions during the slot
* initialization mode.
*/
ctx->streaming = false;
ctx->twophase = false;
}
}
/*
* BEGIN callback.
*
* Don't send the BEGIN message here instead postpone it until the first
* change. In logical replication, a common scenario is to replicate a set of
* tables (instead of all tables) and transactions whose changes were on
* the table(s) that are not published will produce empty transactions. These
* empty transactions will send BEGIN and COMMIT messages to subscribers,
* using bandwidth on something with little/no use for logical replication.
*/
static void
pgoutput_begin_txn(LogicalDecodingContext *ctx, ReorderBufferTXN *txn)
{
PGOutputTxnData *txndata = MemoryContextAllocZero(ctx->context,
sizeof(PGOutputTxnData));
txn->output_plugin_private = txndata;
}
/*
* Send BEGIN.
*
* This is called while processing the first change of the transaction.
*/
static void
pgoutput_send_begin(LogicalDecodingContext *ctx, ReorderBufferTXN *txn)
{
bool send_replication_origin = txn->origin_id != InvalidRepOriginId;
PGOutputTxnData *txndata = (PGOutputTxnData *) txn->output_plugin_private;
Assert(txndata);
Assert(!txndata->sent_begin_txn);
OutputPluginPrepareWrite(ctx, !send_replication_origin);
logicalrep_write_begin(ctx->out, txn);
txndata->sent_begin_txn = true;
send_repl_origin(ctx, txn->origin_id, txn->origin_lsn,
send_replication_origin);
OutputPluginWrite(ctx, true);
}
/*
* COMMIT callback
*/
static void
pgoutput_commit_txn(LogicalDecodingContext *ctx, ReorderBufferTXN *txn,
XLogRecPtr commit_lsn)
{
PGOutputTxnData *txndata = (PGOutputTxnData *) txn->output_plugin_private;
bool sent_begin_txn;
Assert(txndata);
/*
* We don't need to send the commit message unless some relevant change
* from this transaction has been sent to the downstream.
*/
sent_begin_txn = txndata->sent_begin_txn;
OutputPluginUpdateProgress(ctx, !sent_begin_txn);
pfree(txndata);
txn->output_plugin_private = NULL;
if (!sent_begin_txn)
{
elog(DEBUG1, "skipped replication of an empty transaction with XID: %u", txn->xid);
return;
}
OutputPluginPrepareWrite(ctx, true);
logicalrep_write_commit(ctx->out, txn, commit_lsn);
OutputPluginWrite(ctx, true);
}
/*
* BEGIN PREPARE callback
*/
static void
pgoutput_begin_prepare_txn(LogicalDecodingContext *ctx, ReorderBufferTXN *txn)
{
bool send_replication_origin = txn->origin_id != InvalidRepOriginId;
OutputPluginPrepareWrite(ctx, !send_replication_origin);
logicalrep_write_begin_prepare(ctx->out, txn);
send_repl_origin(ctx, txn->origin_id, txn->origin_lsn,
send_replication_origin);
OutputPluginWrite(ctx, true);
}
/*
* PREPARE callback
*/
static void
pgoutput_prepare_txn(LogicalDecodingContext *ctx, ReorderBufferTXN *txn,
XLogRecPtr prepare_lsn)
{
OutputPluginUpdateProgress(ctx, false);
OutputPluginPrepareWrite(ctx, true);
logicalrep_write_prepare(ctx->out, txn, prepare_lsn);
OutputPluginWrite(ctx, true);
}
/*
* COMMIT PREPARED callback
*/
static void
pgoutput_commit_prepared_txn(LogicalDecodingContext *ctx, ReorderBufferTXN *txn,
XLogRecPtr commit_lsn)
{
OutputPluginUpdateProgress(ctx, false);
OutputPluginPrepareWrite(ctx, true);
logicalrep_write_commit_prepared(ctx->out, txn, commit_lsn);
OutputPluginWrite(ctx, true);
}
/*
* ROLLBACK PREPARED callback
*/
static void
pgoutput_rollback_prepared_txn(LogicalDecodingContext *ctx,
ReorderBufferTXN *txn,
XLogRecPtr prepare_end_lsn,
TimestampTz prepare_time)
{
OutputPluginUpdateProgress(ctx, false);
OutputPluginPrepareWrite(ctx, true);
logicalrep_write_rollback_prepared(ctx->out, txn, prepare_end_lsn,
prepare_time);
OutputPluginWrite(ctx, true);
}
/*
* Write the current schema of the relation and its ancestor (if any) if not
* done yet.
*/
static void
maybe_send_schema(LogicalDecodingContext *ctx,
ReorderBufferChange *change,
Relation relation, RelationSyncEntry *relentry)
{
PGOutputData *data = (PGOutputData *) ctx->output_plugin_private;
bool schema_sent;
TransactionId xid = InvalidTransactionId;
TransactionId topxid = InvalidTransactionId;
/*
* Remember XID of the (sub)transaction for the change. We don't care if
* it's top-level transaction or not (we have already sent that XID in
* start of the current streaming block).
*
* If we're not in a streaming block, just use InvalidTransactionId and
* the write methods will not include it.
*/
if (data->in_streaming)
xid = change->txn->xid;
if (rbtxn_is_subtxn(change->txn))
topxid = rbtxn_get_toptxn(change->txn)->xid;
else
topxid = xid;
/*
* Do we need to send the schema? We do track streamed transactions
* separately, because those may be applied later (and the regular
* transactions won't see their effects until then) and in an order that
* we don't know at this point.
*
* XXX There is a scope of optimization here. Currently, we always send
* the schema first time in a streaming transaction but we can probably
* avoid that by checking 'relentry->schema_sent' flag. However, before
* doing that we need to study its impact on the case where we have a mix
* of streaming and non-streaming transactions.
*/
if (data->in_streaming)
schema_sent = get_schema_sent_in_streamed_txn(relentry, topxid);
else
schema_sent = relentry->schema_sent;
/* Nothing to do if we already sent the schema. */
if (schema_sent)
return;
/*
* Send the schema. If the changes will be published using an ancestor's
* schema, not the relation's own, send that ancestor's schema before
* sending relation's own (XXX - maybe sending only the former suffices?).
*/
if (relentry->publish_as_relid != RelationGetRelid(relation))
{
Relation ancestor = RelationIdGetRelation(relentry->publish_as_relid);
send_relation_and_attrs(ancestor, xid, ctx, relentry->columns);
RelationClose(ancestor);
}
send_relation_and_attrs(relation, xid, ctx, relentry->columns);
if (data->in_streaming)
set_schema_sent_in_streamed_txn(relentry, topxid);
else
relentry->schema_sent = true;
}
/*
* Sends a relation
*/
static void
send_relation_and_attrs(Relation relation, TransactionId xid,
LogicalDecodingContext *ctx,
Bitmapset *columns)
{
TupleDesc desc = RelationGetDescr(relation);
int i;
/*
* Write out type info if needed. We do that only for user-created types.
* We use FirstGenbkiObjectId as the cutoff, so that we only consider
* objects with hand-assigned OIDs to be "built in", not for instance any
* function or type defined in the information_schema. This is important
* because only hand-assigned OIDs can be expected to remain stable across
* major versions.
*/
for (i = 0; i < desc->natts; i++)
{
Form_pg_attribute att = TupleDescAttr(desc, i);
if (att->attisdropped || att->attgenerated)
continue;
if (att->atttypid < FirstGenbkiObjectId)
continue;
/* Skip this attribute if it's not present in the column list */
if (columns != NULL && !bms_is_member(att->attnum, columns))
continue;
OutputPluginPrepareWrite(ctx, false);
logicalrep_write_typ(ctx->out, xid, att->atttypid);
OutputPluginWrite(ctx, false);
}
OutputPluginPrepareWrite(ctx, false);
logicalrep_write_rel(ctx->out, xid, relation, columns);
OutputPluginWrite(ctx, false);
}
/*
* Executor state preparation for evaluation of row filter expressions for the
* specified relation.
*/
static EState *
create_estate_for_relation(Relation rel)
{
EState *estate;
RangeTblEntry *rte;
List *perminfos = NIL;
estate = CreateExecutorState();
rte = makeNode(RangeTblEntry);
rte->rtekind = RTE_RELATION;
rte->relid = RelationGetRelid(rel);
rte->relkind = rel->rd_rel->relkind;
rte->rellockmode = AccessShareLock;
addRTEPermissionInfo(&perminfos, rte);
ExecInitRangeTable(estate, list_make1(rte), perminfos);
estate->es_output_cid = GetCurrentCommandId(false);
return estate;
}
/*
* Evaluates row filter.
*
* If the row filter evaluates to NULL, it is taken as false i.e. the change
* isn't replicated.
*/
static bool
pgoutput_row_filter_exec_expr(ExprState *state, ExprContext *econtext)
{
Datum ret;
bool isnull;
Assert(state != NULL);
ret = ExecEvalExprSwitchContext(state, econtext, &isnull);
elog(DEBUG3, "row filter evaluates to %s (isnull: %s)",
isnull ? "false" : DatumGetBool(ret) ? "true" : "false",
isnull ? "true" : "false");
if (isnull)
return false;
return DatumGetBool(ret);
}
/*
* Make sure the per-entry memory context exists.
*/
static void
pgoutput_ensure_entry_cxt(PGOutputData *data, RelationSyncEntry *entry)
{
Relation relation;
/* The context may already exist, in which case bail out. */
if (entry->entry_cxt)
return;
relation = RelationIdGetRelation(entry->publish_as_relid);
entry->entry_cxt = AllocSetContextCreate(data->cachectx,
"entry private context",
ALLOCSET_SMALL_SIZES);
MemoryContextCopyAndSetIdentifier(entry->entry_cxt,
RelationGetRelationName(relation));
}
/*
* Initialize the row filter.
*/
static void
pgoutput_row_filter_init(PGOutputData *data, List *publications,
RelationSyncEntry *entry)
{
ListCell *lc;
List *rfnodes[] = {NIL, NIL, NIL}; /* One per pubaction */
bool no_filter[] = {false, false, false}; /* One per pubaction */
MemoryContext oldctx;
int idx;
bool has_filter = true;
Oid schemaid = get_rel_namespace(entry->publish_as_relid);
/*
* Find if there are any row filters for this relation. If there are, then
* prepare the necessary ExprState and cache it in entry->exprstate. To
* build an expression state, we need to ensure the following:
*
* All the given publication-table mappings must be checked.
*
* Multiple publications might have multiple row filters for this
* relation. Since row filter usage depends on the DML operation, there
* are multiple lists (one for each operation) to which row filters will
* be appended.
*
* FOR ALL TABLES and FOR TABLES IN SCHEMA implies "don't use row filter
* expression" so it takes precedence.
*/
foreach(lc, publications)
{
Publication *pub = lfirst(lc);
HeapTuple rftuple = NULL;
Datum rfdatum = 0;
bool pub_no_filter = true;
/*
* If the publication is FOR ALL TABLES, or the publication includes a
* FOR TABLES IN SCHEMA where the table belongs to the referred
* schema, then it is treated the same as if there are no row filters
* (even if other publications have a row filter).
*/
if (!pub->alltables &&
!SearchSysCacheExists2(PUBLICATIONNAMESPACEMAP,
ObjectIdGetDatum(schemaid),
ObjectIdGetDatum(pub->oid)))
{
/*
* Check for the presence of a row filter in this publication.
*/
rftuple = SearchSysCache2(PUBLICATIONRELMAP,
ObjectIdGetDatum(entry->publish_as_relid),
ObjectIdGetDatum(pub->oid));
if (HeapTupleIsValid(rftuple))
{
/* Null indicates no filter. */
rfdatum = SysCacheGetAttr(PUBLICATIONRELMAP, rftuple,
Anum_pg_publication_rel_prqual,
&pub_no_filter);
}
}
if (pub_no_filter)
{
if (rftuple)
ReleaseSysCache(rftuple);
no_filter[PUBACTION_INSERT] |= pub->pubactions.pubinsert;
no_filter[PUBACTION_UPDATE] |= pub->pubactions.pubupdate;
no_filter[PUBACTION_DELETE] |= pub->pubactions.pubdelete;
/*
* Quick exit if all the DML actions are publicized via this
* publication.
*/
if (no_filter[PUBACTION_INSERT] &&
no_filter[PUBACTION_UPDATE] &&
no_filter[PUBACTION_DELETE])
{
has_filter = false;
break;
}
/* No additional work for this publication. Next one. */
continue;
}
/* Form the per pubaction row filter lists. */
if (pub->pubactions.pubinsert && !no_filter[PUBACTION_INSERT])
rfnodes[PUBACTION_INSERT] = lappend(rfnodes[PUBACTION_INSERT],
TextDatumGetCString(rfdatum));
if (pub->pubactions.pubupdate && !no_filter[PUBACTION_UPDATE])
rfnodes[PUBACTION_UPDATE] = lappend(rfnodes[PUBACTION_UPDATE],
TextDatumGetCString(rfdatum));
if (pub->pubactions.pubdelete && !no_filter[PUBACTION_DELETE])
rfnodes[PUBACTION_DELETE] = lappend(rfnodes[PUBACTION_DELETE],
TextDatumGetCString(rfdatum));
ReleaseSysCache(rftuple);
} /* loop all subscribed publications */
/* Clean the row filter */
for (idx = 0; idx < NUM_ROWFILTER_PUBACTIONS; idx++)
{
if (no_filter[idx])
{
list_free_deep(rfnodes[idx]);
rfnodes[idx] = NIL;
}
}
if (has_filter)
{
Relation relation = RelationIdGetRelation(entry->publish_as_relid);
pgoutput_ensure_entry_cxt(data, entry);
/*
* Now all the filters for all pubactions are known. Combine them when
* their pubactions are the same.
*/
oldctx = MemoryContextSwitchTo(entry->entry_cxt);
entry->estate = create_estate_for_relation(relation);
for (idx = 0; idx < NUM_ROWFILTER_PUBACTIONS; idx++)
{
List *filters = NIL;
Expr *rfnode;
if (rfnodes[idx] == NIL)
continue;
foreach(lc, rfnodes[idx])
filters = lappend(filters, stringToNode((char *) lfirst(lc)));