diff --git a/doc/src/sgml/backup.sgml b/doc/src/sgml/backup.sgml index 8cb24d6ae542f..b3468eea3cb99 100644 --- a/doc/src/sgml/backup.sgml +++ b/doc/src/sgml/backup.sgml @@ -857,12 +857,79 @@ test ! -f /mnt/server/archivedir/00000001000000A900000065 && cp pg_wal/0 + + Making an Incremental Backup + + + You can use to take an incremental + backup by specifying the --incremental option. You must + supply, as an argument to --incremental, the backup + manifest to an earlier backup from the same server. In the resulting + backup, non-relation files will be included in their entirety, but some + relation files may be replaced by smaller incremental files which contain + only the blocks which have been changed since the earlier backup and enough + metadata to reconstruct the current version of the file. + + + + To figure out which blocks need to be backed up, the server uses WAL + summaries, which are stored in the data directory, inside the directory + pg_wal/summaries. If the required summary files are not + present, an attempt to take an incremental backup will fail. The summaries + present in this directory must cover all LSNs from the start LSN of the + prior backup to the start LSN of the current backup. Since the server looks + for WAL summaries just after establishing the start LSN of the current + backup, the necessary summary files probably won't be instantly present + on disk, but the server will wait for any missing files to show up. + This also helps if the WAL summarization process has fallen behind. + However, if the necessary files have already been removed, or if the WAL + summarizer doesn't catch up quickly enough, the incremental backup will + fail. + + + + When restoring an incremental backup, it will be necessary to have not + only the incremental backup itself but also all earlier backups that + are required to supply the blocks omitted from the incremental backup. + See for further information about + this requirement. + + + + Note that all of the requirements for making use of a full backup also + apply to an incremental backup. For instance, you still need all of the + WAL segment files generated during and after the file system backup, and + any relevant WAL history files. And you still need to create a + recovery.signal (or standby.signal) + and perform recovery, as described in + . The requirement to have earlier + backups available at restore time and to use + pg_combinebackup is an additional requirement on top of + everything else. Keep in mind that PostgreSQL + has no built-in mechanism to figure out which backups are still needed as + a basis for restoring later incremental backups. You must keep track of + the relationships between your full and incremental backups on your own, + and be certain not to remove earlier backups if they might be needed when + restoring later incremental backups. + + + + Incremental backups typically only make sense for relatively large + databases where a significant portion of the data does not change, or only + changes slowly. For a small database, it's simpler to ignore the existence + of incremental backups and simply take full backups, which are simpler + to manage. For a large database all of which is heavily modified, + incremental backups won't be much smaller than full backups. + + + Making a Base Backup Using the Low Level API - The procedure for making a base backup using the low level - APIs contains a few more steps than - the method, but is relatively + Instead of taking a full or incremental base backup using + , you can take a base backup using the + low-level API. This procedure contains a few more steps than + the pg_basebackup method, but is relatively simple. It is very important that these steps are executed in sequence, and that the success of a step is verified before proceeding to the next step. @@ -1118,7 +1185,8 @@ SELECT * FROM pg_backup_stop(wait_for_archive => true); - Restore the database files from your file system backup. Be sure that they + If you're restoring a full backup, you can restore the database files + directly into the target directories. Be sure that they are restored with the right ownership (the database system user, not root!) and with the right permissions. If you are using tablespaces, @@ -1126,6 +1194,19 @@ SELECT * FROM pg_backup_stop(wait_for_archive => true); were correctly restored. + + + If you're restoring an incremental backup, you'll need to restore the + incremental backup and all earlier backups upon which it directly or + indirectly depends to the machine where you are performing the restore. + These backups will need to be placed in separate directories, not the + target directories where you want the running server to end up. + Once this is done, use to pull + data from the full backup and all of the subsequent incremental backups + and write out a synthetic full backup to the target directories. As above, + verify that permissions and tablespace links are correct. + + Remove any files present in pg_wal/; these came from the diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml index ee985850275d0..b5624ca884741 100644 --- a/doc/src/sgml/config.sgml +++ b/doc/src/sgml/config.sgml @@ -4153,13 +4153,11 @@ restore_command = 'copy "C:\\server\\archivedir\\%f" "%p"' # Windows WAL Summarization - diff --git a/doc/src/sgml/protocol.sgml b/doc/src/sgml/protocol.sgml index af3f016f74677..9a66918171a56 100644 --- a/doc/src/sgml/protocol.sgml +++ b/doc/src/sgml/protocol.sgml @@ -2599,6 +2599,19 @@ psql "dbname=postgres replication=database" -c "IDENTIFY_SYSTEM;" + + + UPLOAD_MANIFEST + UPLOAD_MANIFEST + + + + Uploads a backup manifest in preparation for taking an incremental + backup. + + + + BASE_BACKUP [ ( option [, ...] ) ] BASE_BACKUP @@ -2838,6 +2851,17 @@ psql "dbname=postgres replication=database" -c "IDENTIFY_SYSTEM;" + + + INCREMENTAL + + + Requests an incremental backup. The + UPLOAD_MANIFEST command must be executed + before running a base backup with this option. + + + diff --git a/doc/src/sgml/ref/allfiles.sgml b/doc/src/sgml/ref/allfiles.sgml index 54b5f22d6ec9f..fda4690eab52c 100644 --- a/doc/src/sgml/ref/allfiles.sgml +++ b/doc/src/sgml/ref/allfiles.sgml @@ -202,6 +202,7 @@ Complete list of usable sgml source files in this directory. + diff --git a/doc/src/sgml/ref/pg_basebackup.sgml b/doc/src/sgml/ref/pg_basebackup.sgml index 0b87fd2d4d621..7c183a5cfd20e 100644 --- a/doc/src/sgml/ref/pg_basebackup.sgml +++ b/doc/src/sgml/ref/pg_basebackup.sgml @@ -38,11 +38,25 @@ PostgreSQL documentation - pg_basebackup makes an exact copy of the database - cluster's files, while making sure the server is put into and - out of backup mode automatically. Backups are always taken of the entire - database cluster; it is not possible to back up individual databases or - database objects. For selective backups, another tool such as + pg_basebackup can take a full or incremental + base backup of the database. When used to take a full backup, it makes an + exact copy of the database cluster's files. When used to take an incremental + backup, some files that would have been part of a full backup may be + replaced with incremental versions of the same files, containing only those + blocks that have been modified since the reference backup. An incremental + backup cannot be used directly; instead, + must first + be used to combine it with the previous backups upon which it depends. + See for more information + about incremental backups, and + for steps to recover from a backup. + + + + In any mode, pg_basebackup makes sure the server + is put into and out of backup mode automatically. Backups are always taken of + the entire database cluster; it is not possible to back up individual + databases or database objects. For selective backups, another tool such as must be used. @@ -197,6 +211,19 @@ PostgreSQL documentation + + + + + + Performs an incremental + backup. The backup manifest for the reference + backup must be provided, and will be uploaded to the server, which will + respond by sending the requested incremental backup. + + + + diff --git a/doc/src/sgml/ref/pg_combinebackup.sgml b/doc/src/sgml/ref/pg_combinebackup.sgml new file mode 100644 index 0000000000000..e1729671a5d42 --- /dev/null +++ b/doc/src/sgml/ref/pg_combinebackup.sgml @@ -0,0 +1,240 @@ + + + + + pg_combinebackup + + + + pg_combinebackup + 1 + Application + + + + pg_combinebackup + reconstruct a full backup from an incremental backup and dependent backups + + + + + pg_combinebackup + option + backup_directory + + + + + Description + + pg_combinebackup is used to reconstruct a + synthetic full backup from an + incremental backup and the + earlier backups upon which it depends. + + + + Specify all of the required backups on the command line from oldest to newest. + That is, the first backup directory should be the path to the full backup, and + the last should be the path to the final incremental backup + that you wish to restore. The reconstructed backup will be written to the + output directory specified by the option. + + + + Although pg_combinebackup will attempt to verify + that the backups you specify form a legal backup chain from which a correct + full backup can be reconstructed, it is not designed to help you keep track + of which backups depend on which other backups. If you remove the one or + more of the previous backups upon which your incremental + backup relies, you will not be able to restore it. + + + + Since the output of pg_combinebackup is a + synthetic full backup, it can be used as an input to a future invocation of + pg_combinebackup. The synthetic full backup would + be specified on the command line in lieu of the chain of backups from which + it was reconstructed. + + + + + Options + + + + + + + + + Print lots of debug logging output on stderr. + + + + + + + + + + The / option instructs + pg_cominebackup to figure out what would be done + without actually creating the target directory or any output files. + It is particularly useful in comination with . + + + + + + + + + + By default, pg_combinebackup will wait for all files + to be written safely to disk. This option causes + pg_combinebackup to return without waiting, which is + faster, but means that a subsequent operating system crash can leave + the output backup corrupt. Generally, this option is useful for testing + but should not be used when creating a production installation. + + + + + + + + + + Specifies the output directory to which the synthetic full backup + should be written. Currently, this argument is required. + + + + + + + + + + Relocates the tablespace in directory olddir + to newdir during the backup. + olddir is the absolute path of the tablespace + as it exists in the first backup specified on the command line, + and newdir is the absolute path to use for the + tablespace in the reconstructed backup. If either path needs to contain + an equal sign (=), precede that with a backslash. + This option can be specified multiple times for multiple tablespaces. + + + + + + + + + Like , + pg_combinebackup writes a backup manifest + in the output directory. This option specifies the checksum algorithm + that should be applied to each file included in the backup manifest. + Currently, the available algorithms are NONE, + CRC32C, SHA224, + SHA256, SHA384, + and SHA512. The default is CRC32C. + + + + + + + + + Disables generation of a backup manifest. If this option is not + specified, a backup manifest for the reconstructed backup will be + written to the output directory. + + + + + + + + + When set to fsync, which is the default, + pg_combinebackup will recursively open and synchronize + all files in the backup directory. When the plain format is used, the + search for files will follow symbolic links for the WAL directory and + each configured tablespace. + + + On Linux, syncfs may be used instead to ask the + operating system to synchronize the whole file system that contains the + backup directory. When the plain format is used, + pg_combinebackup will also synchronize the file systems + that contain the WAL files and each tablespace. See + for more information about using + syncfs(). + + + This option has no effect when is used. + + + + + + + + + + Prints the pg_combinebackup version and + exits. + + + + + + + + + + Shows help about pg_combinebackup command + line arguments, and exits. + + + + + + + + + + + Environment + + + This utility, like most other PostgreSQL utilities, + uses the environment variables supported by libpq + (see ). + + + + The environment variable PG_COLOR specifies whether to use + color in diagnostic messages. Possible values are + always, auto and + never. + + + + + See Also + + + + + + + diff --git a/doc/src/sgml/reference.sgml b/doc/src/sgml/reference.sgml index e11b4b6130753..a07d2b5e01e60 100644 --- a/doc/src/sgml/reference.sgml +++ b/doc/src/sgml/reference.sgml @@ -250,6 +250,7 @@ &pgamcheck; &pgBasebackup; &pgbench; + &pgCombinebackup; &pgConfig; &pgDump; &pgDumpall; diff --git a/src/backend/access/transam/xlogbackup.c b/src/backend/access/transam/xlogbackup.c index 21d68133ae106..f51d4282bb8dd 100644 --- a/src/backend/access/transam/xlogbackup.c +++ b/src/backend/access/transam/xlogbackup.c @@ -77,6 +77,16 @@ build_backup_content(BackupState *state, bool ishistoryfile) appendStringInfo(result, "STOP TIMELINE: %u\n", state->stoptli); } + /* either both istartpoint and istarttli should be set, or neither */ + Assert(XLogRecPtrIsInvalid(state->istartpoint) == (state->istarttli == 0)); + if (!XLogRecPtrIsInvalid(state->istartpoint)) + { + appendStringInfo(result, "INCREMENTAL FROM LSN: %X/%X\n", + LSN_FORMAT_ARGS(state->istartpoint)); + appendStringInfo(result, "INCREMENTAL FROM TLI: %u\n", + state->istarttli); + } + data = result->data; pfree(result); diff --git a/src/backend/access/transam/xlogrecovery.c b/src/backend/access/transam/xlogrecovery.c index a2c8fa3981ca6..6f4f81f99277f 100644 --- a/src/backend/access/transam/xlogrecovery.c +++ b/src/backend/access/transam/xlogrecovery.c @@ -1295,6 +1295,12 @@ read_backup_label(XLogRecPtr *checkPointLoc, TimeLineID *backupLabelTLI, tli_from_file, BACKUP_LABEL_FILE))); } + if (fscanf(lfp, "INCREMENTAL FROM LSN: %X/%X\n", &hi, &lo) > 0) + ereport(FATAL, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("this is an incremental backup, not a data directory"), + errhint("Use pg_combinebackup to reconstruct a valid data directory."))); + if (ferror(lfp) || FreeFile(lfp)) ereport(FATAL, (errcode_for_file_access(), diff --git a/src/backend/backup/Makefile b/src/backend/backup/Makefile index a67b3c58d4741..751e6d3d5e2e4 100644 --- a/src/backend/backup/Makefile +++ b/src/backend/backup/Makefile @@ -19,6 +19,7 @@ OBJS = \ basebackup.o \ basebackup_copy.o \ basebackup_gzip.o \ + basebackup_incremental.o \ basebackup_lz4.o \ basebackup_zstd.o \ basebackup_progress.o \ diff --git a/src/backend/backup/basebackup.c b/src/backend/backup/basebackup.c index 35dd79babcb37..5ee9628422e05 100644 --- a/src/backend/backup/basebackup.c +++ b/src/backend/backup/basebackup.c @@ -20,8 +20,10 @@ #include "access/xlogbackup.h" #include "backup/backup_manifest.h" #include "backup/basebackup.h" +#include "backup/basebackup_incremental.h" #include "backup/basebackup_sink.h" #include "backup/basebackup_target.h" +#include "catalog/pg_tablespace_d.h" #include "commands/defrem.h" #include "common/compression.h" #include "common/file_perm.h" @@ -33,6 +35,7 @@ #include "pgtar.h" #include "port.h" #include "postmaster/syslogger.h" +#include "postmaster/walsummarizer.h" #include "replication/walsender.h" #include "replication/walsender_private.h" #include "storage/bufpage.h" @@ -64,6 +67,7 @@ typedef struct bool fastcheckpoint; bool nowait; bool includewal; + bool incremental; uint32 maxrate; bool sendtblspcmapfile; bool send_to_client; @@ -76,21 +80,28 @@ typedef struct } basebackup_options; static int64 sendTablespace(bbsink *sink, char *path, Oid spcoid, bool sizeonly, - struct backup_manifest_info *manifest); + struct backup_manifest_info *manifest, + IncrementalBackupInfo *ib); static int64 sendDir(bbsink *sink, const char *path, int basepathlen, bool sizeonly, List *tablespaces, bool sendtblspclinks, - backup_manifest_info *manifest, Oid spcoid); + backup_manifest_info *manifest, Oid spcoid, + IncrementalBackupInfo *ib); static bool sendFile(bbsink *sink, const char *readfilename, const char *tarfilename, struct stat *statbuf, bool missing_ok, Oid dboid, Oid spcoid, RelFileNumber relfilenumber, unsigned segno, - backup_manifest_info *manifest); + backup_manifest_info *manifest, + unsigned num_incremental_blocks, + BlockNumber *incremental_blocks, + unsigned truncation_block_length); static off_t read_file_data_into_buffer(bbsink *sink, const char *readfilename, int fd, off_t offset, size_t length, BlockNumber blkno, bool verify_checksum, int *checksum_failures); +static void push_to_sink(bbsink *sink, pg_checksum_context *checksum_ctx, + size_t *bytes_done, void *data, size_t length); static bool verify_page_checksum(Page page, XLogRecPtr start_lsn, BlockNumber blkno, uint16 *expected_checksum); @@ -102,7 +113,8 @@ static int64 _tarWriteHeader(bbsink *sink, const char *filename, bool sizeonly); static void _tarWritePadding(bbsink *sink, int len); static void convert_link_to_directory(const char *pathbuf, struct stat *statbuf); -static void perform_base_backup(basebackup_options *opt, bbsink *sink); +static void perform_base_backup(basebackup_options *opt, bbsink *sink, + IncrementalBackupInfo *ib); static void parse_basebackup_options(List *options, basebackup_options *opt); static int compareWalFileNames(const ListCell *a, const ListCell *b); static int basebackup_read_file(int fd, char *buf, size_t nbytes, off_t offset, @@ -220,7 +232,8 @@ static const struct exclude_list_item excludeFiles[] = * clobbered by longjmp" from stupider versions of gcc. */ static void -perform_base_backup(basebackup_options *opt, bbsink *sink) +perform_base_backup(basebackup_options *opt, bbsink *sink, + IncrementalBackupInfo *ib) { bbsink_state state; XLogRecPtr endptr; @@ -270,6 +283,10 @@ perform_base_backup(basebackup_options *opt, bbsink *sink) ListCell *lc; tablespaceinfo *newti; + /* If this is an incremental backup, execute preparatory steps. */ + if (ib != NULL) + PrepareForIncrementalBackup(ib, backup_state); + /* Add a node for the base directory at the end */ newti = palloc0(sizeof(tablespaceinfo)); newti->size = -1; @@ -289,10 +306,10 @@ perform_base_backup(basebackup_options *opt, bbsink *sink) if (tmp->path == NULL) tmp->size = sendDir(sink, ".", 1, true, state.tablespaces, - true, NULL, InvalidOid); + true, NULL, InvalidOid, NULL); else tmp->size = sendTablespace(sink, tmp->path, tmp->oid, true, - NULL); + NULL, NULL); state.bytes_total += tmp->size; } state.bytes_total_is_valid = true; @@ -330,7 +347,7 @@ perform_base_backup(basebackup_options *opt, bbsink *sink) /* Then the bulk of the files... */ sendDir(sink, ".", 1, false, state.tablespaces, - sendtblspclinks, &manifest, InvalidOid); + sendtblspclinks, &manifest, InvalidOid, ib); /* ... and pg_control after everything else. */ if (lstat(XLOG_CONTROL_FILE, &statbuf) != 0) @@ -340,7 +357,7 @@ perform_base_backup(basebackup_options *opt, bbsink *sink) XLOG_CONTROL_FILE))); sendFile(sink, XLOG_CONTROL_FILE, XLOG_CONTROL_FILE, &statbuf, false, InvalidOid, InvalidOid, - InvalidRelFileNumber, 0, &manifest); + InvalidRelFileNumber, 0, &manifest, 0, NULL, 0); } else { @@ -348,7 +365,7 @@ perform_base_backup(basebackup_options *opt, bbsink *sink) bbsink_begin_archive(sink, archive_name); - sendTablespace(sink, ti->path, ti->oid, false, &manifest); + sendTablespace(sink, ti->path, ti->oid, false, &manifest, ib); } /* @@ -610,7 +627,7 @@ perform_base_backup(basebackup_options *opt, bbsink *sink) sendFile(sink, pathbuf, pathbuf, &statbuf, false, InvalidOid, InvalidOid, InvalidRelFileNumber, 0, - &manifest); + &manifest, 0, NULL, 0); /* unconditionally mark file as archived */ StatusFilePath(pathbuf, fname, ".done"); @@ -686,6 +703,7 @@ parse_basebackup_options(List *options, basebackup_options *opt) bool o_checkpoint = false; bool o_nowait = false; bool o_wal = false; + bool o_incremental = false; bool o_maxrate = false; bool o_tablespace_map = false; bool o_noverify_checksums = false; @@ -764,6 +782,20 @@ parse_basebackup_options(List *options, basebackup_options *opt) opt->includewal = defGetBoolean(defel); o_wal = true; } + else if (strcmp(defel->defname, "incremental") == 0) + { + if (o_incremental) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("duplicate option \"%s\"", defel->defname))); + opt->incremental = defGetBoolean(defel); + if (opt->incremental && !summarize_wal) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("incremental backups cannot be taken unless WAL summarization is enabled"))); + opt->incremental = defGetBoolean(defel); + o_incremental = true; + } else if (strcmp(defel->defname, "max_rate") == 0) { int64 maxrate; @@ -956,7 +988,7 @@ parse_basebackup_options(List *options, basebackup_options *opt) * the filesystem, bypassing the buffer cache. */ void -SendBaseBackup(BaseBackupCmd *cmd) +SendBaseBackup(BaseBackupCmd *cmd, IncrementalBackupInfo *ib) { basebackup_options opt; bbsink *sink; @@ -980,6 +1012,20 @@ SendBaseBackup(BaseBackupCmd *cmd) set_ps_display(activitymsg); } + /* + * If we're asked to perform an incremental backup and the user has not + * supplied a manifest, that's an ERROR. + * + * If we're asked to perform a full backup and the user did supply a + * manifest, just ignore it. + */ + if (!opt.incremental) + ib = NULL; + else if (ib == NULL) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("must UPLOAD_MANIFEST before performing an incremental BASE_BACKUP"))); + /* * If the target is specifically 'client' then set up to stream the backup * to the client; otherwise, it's being sent someplace else and should not @@ -1011,7 +1057,7 @@ SendBaseBackup(BaseBackupCmd *cmd) */ PG_TRY(); { - perform_base_backup(&opt, sink); + perform_base_backup(&opt, sink, ib); } PG_FINALLY(); { @@ -1089,7 +1135,7 @@ sendFileWithContent(bbsink *sink, const char *filename, const char *content, */ static int64 sendTablespace(bbsink *sink, char *path, Oid spcoid, bool sizeonly, - backup_manifest_info *manifest) + backup_manifest_info *manifest, IncrementalBackupInfo *ib) { int64 size; char pathbuf[MAXPGPATH]; @@ -1123,7 +1169,7 @@ sendTablespace(bbsink *sink, char *path, Oid spcoid, bool sizeonly, /* Send all the files in the tablespace version directory */ size += sendDir(sink, pathbuf, strlen(path), sizeonly, NIL, true, manifest, - spcoid); + spcoid, ib); return size; } @@ -1143,7 +1189,7 @@ sendTablespace(bbsink *sink, char *path, Oid spcoid, bool sizeonly, static int64 sendDir(bbsink *sink, const char *path, int basepathlen, bool sizeonly, List *tablespaces, bool sendtblspclinks, backup_manifest_info *manifest, - Oid spcoid) + Oid spcoid, IncrementalBackupInfo *ib) { DIR *dir; struct dirent *de; @@ -1152,7 +1198,16 @@ sendDir(bbsink *sink, const char *path, int basepathlen, bool sizeonly, int64 size = 0; const char *lastDir; /* Split last dir from parent path. */ bool isRelationDir = false; /* Does directory contain relations? */ + bool isGlobalDir = false; Oid dboid = InvalidOid; + BlockNumber *relative_block_numbers = NULL; + + /* + * Since this array is relatively large, avoid putting it on the stack. + * But we don't need it at all if this is not an incremental backup. + */ + if (ib != NULL) + relative_block_numbers = palloc(sizeof(BlockNumber) * RELSEG_SIZE); /* * Determine if the current path is a database directory that can contain @@ -1185,7 +1240,10 @@ sendDir(bbsink *sink, const char *path, int basepathlen, bool sizeonly, } } else if (strcmp(path, "./global") == 0) + { isRelationDir = true; + isGlobalDir = true; + } dir = AllocateDir(path); while ((de = ReadDir(dir, path)) != NULL) @@ -1334,11 +1392,13 @@ sendDir(bbsink *sink, const char *path, int basepathlen, bool sizeonly, &statbuf, sizeonly); /* - * Also send archive_status directory (by hackishly reusing - * statbuf from above ...). + * Also send archive_status and summaries directories (by + * hackishly reusing statbuf from above ...). */ size += _tarWriteHeader(sink, "./pg_wal/archive_status", NULL, &statbuf, sizeonly); + size += _tarWriteHeader(sink, "./pg_wal/summaries", NULL, + &statbuf, sizeonly); continue; /* don't recurse into pg_wal */ } @@ -1407,16 +1467,64 @@ sendDir(bbsink *sink, const char *path, int basepathlen, bool sizeonly, if (!skip_this_dir) size += sendDir(sink, pathbuf, basepathlen, sizeonly, tablespaces, - sendtblspclinks, manifest, spcoid); + sendtblspclinks, manifest, spcoid, ib); } else if (S_ISREG(statbuf.st_mode)) { bool sent = false; + unsigned num_blocks_required = 0; + unsigned truncation_block_length = 0; + char tarfilenamebuf[MAXPGPATH * 2]; + char *tarfilename = pathbuf + basepathlen + 1; + FileBackupMethod method = BACK_UP_FILE_FULLY; + + if (ib != NULL && isRelationFile) + { + Oid relspcoid; + char *lookup_path; + + if (OidIsValid(spcoid)) + { + relspcoid = spcoid; + lookup_path = psprintf("pg_tblspc/%u/%s", spcoid, + tarfilename); + } + else + { + if (isGlobalDir) + relspcoid = GLOBALTABLESPACE_OID; + else + relspcoid = DEFAULTTABLESPACE_OID; + lookup_path = pstrdup(tarfilename); + } + + method = GetFileBackupMethod(ib, lookup_path, dboid, relspcoid, + relfilenumber, relForkNum, + segno, statbuf.st_size, + &num_blocks_required, + relative_block_numbers, + &truncation_block_length); + if (method == BACK_UP_FILE_INCREMENTALLY) + { + statbuf.st_size = + GetIncrementalFileSize(num_blocks_required); + snprintf(tarfilenamebuf, sizeof(tarfilenamebuf), + "%s/INCREMENTAL.%s", + path + basepathlen + 1, + de->d_name); + tarfilename = tarfilenamebuf; + } + + pfree(lookup_path); + } if (!sizeonly) - sent = sendFile(sink, pathbuf, pathbuf + basepathlen + 1, &statbuf, + sent = sendFile(sink, pathbuf, tarfilename, &statbuf, true, dboid, spcoid, - relfilenumber, segno, manifest); + relfilenumber, segno, manifest, + num_blocks_required, + method == BACK_UP_FILE_INCREMENTALLY ? relative_block_numbers : NULL, + truncation_block_length); if (sent || sizeonly) { @@ -1434,6 +1542,10 @@ sendDir(bbsink *sink, const char *path, int basepathlen, bool sizeonly, ereport(WARNING, (errmsg("skipping special file \"%s\"", pathbuf))); } + + if (relative_block_numbers != NULL) + pfree(relative_block_numbers); + FreeDir(dir); return size; } @@ -1446,6 +1558,12 @@ sendDir(bbsink *sink, const char *path, int basepathlen, bool sizeonly, * If dboid is anything other than InvalidOid then any checksum failures * detected will get reported to the cumulative stats system. * + * If the file is to be sent incrementally, then num_incremental_blocks + * should be the number of blocks to be sent, and incremental_blocks + * an array of block numbers relative to the start of the current segment. + * If the whole file is to be sent, then incremental_blocks should be NULL, + * and num_incremental_blocks can have any value, as it will be ignored. + * * Returns true if the file was successfully sent, false if 'missing_ok', * and the file did not exist. */ @@ -1453,7 +1571,8 @@ static bool sendFile(bbsink *sink, const char *readfilename, const char *tarfilename, struct stat *statbuf, bool missing_ok, Oid dboid, Oid spcoid, RelFileNumber relfilenumber, unsigned segno, - backup_manifest_info *manifest) + backup_manifest_info *manifest, unsigned num_incremental_blocks, + BlockNumber *incremental_blocks, unsigned truncation_block_length) { int fd; BlockNumber blkno = 0; @@ -1462,6 +1581,7 @@ sendFile(bbsink *sink, const char *readfilename, const char *tarfilename, pgoff_t bytes_done = 0; bool verify_checksum = false; pg_checksum_context checksum_ctx; + int ibindex = 0; if (pg_checksum_init(&checksum_ctx, manifest->checksum_type) < 0) elog(ERROR, "could not initialize checksum of file \"%s\"", @@ -1494,22 +1614,111 @@ sendFile(bbsink *sink, const char *readfilename, const char *tarfilename, RelFileNumberIsValid(relfilenumber)) verify_checksum = true; + /* + * If we're sending an incremental file, write the file header. + */ + if (incremental_blocks != NULL) + { + unsigned magic = INCREMENTAL_MAGIC; + size_t header_bytes_done = 0; + + /* Emit header data. */ + push_to_sink(sink, &checksum_ctx, &header_bytes_done, + &magic, sizeof(magic)); + push_to_sink(sink, &checksum_ctx, &header_bytes_done, + &num_incremental_blocks, sizeof(num_incremental_blocks)); + push_to_sink(sink, &checksum_ctx, &header_bytes_done, + &truncation_block_length, sizeof(truncation_block_length)); + push_to_sink(sink, &checksum_ctx, &header_bytes_done, + incremental_blocks, + sizeof(BlockNumber) * num_incremental_blocks); + + /* Flush out any data still in the buffer so it's again empty. */ + if (header_bytes_done > 0) + { + bbsink_archive_contents(sink, header_bytes_done); + if (pg_checksum_update(&checksum_ctx, + (uint8 *) sink->bbs_buffer, + header_bytes_done) < 0) + elog(ERROR, "could not update checksum of base backup"); + } + + /* Update our notion of file position. */ + bytes_done += sizeof(magic); + bytes_done += sizeof(num_incremental_blocks); + bytes_done += sizeof(truncation_block_length); + bytes_done += sizeof(BlockNumber) * num_incremental_blocks; + } + /* * Loop until we read the amount of data the caller told us to expect. The * file could be longer, if it was extended while we were sending it, but * for a base backup we can ignore such extended data. It will be restored * from WAL. */ - while (bytes_done < statbuf->st_size) + while (1) { - size_t remaining = statbuf->st_size - bytes_done; + /* + * Determine whether we've read all the data that we need, and if not, + * read some more. + */ + if (incremental_blocks == NULL) + { + size_t remaining = statbuf->st_size - bytes_done; + + /* + * If we've read the required number of bytes, then it's time to + * stop. + */ + if (bytes_done >= statbuf->st_size) + break; + + /* + * Read as many bytes as will fit in the buffer, or however many + * are left to read, whichever is less. + */ + cnt = read_file_data_into_buffer(sink, readfilename, fd, + bytes_done, remaining, + blkno + segno * RELSEG_SIZE, + verify_checksum, + &checksum_failures); + } + else + { + BlockNumber relative_blkno; - /* Try to read some more data. */ - cnt = read_file_data_into_buffer(sink, readfilename, fd, bytes_done, - remaining, - blkno + segno * RELSEG_SIZE, - verify_checksum, - &checksum_failures); + /* + * If we've read all the blocks, then it's time to stop. + */ + if (ibindex >= num_incremental_blocks) + break; + + /* + * Read just one block, whichever one is the next that we're + * supposed to include. + */ + relative_blkno = incremental_blocks[ibindex++]; + cnt = read_file_data_into_buffer(sink, readfilename, fd, + relative_blkno * BLCKSZ, + BLCKSZ, + relative_blkno + segno * RELSEG_SIZE, + verify_checksum, + &checksum_failures); + + /* + * If we get a partial read, that must mean that the relation is + * being truncated. Ultimately, it should be truncated to a + * multiple of BLCKSZ, since this path should only be reached for + * relation files, but we might transiently observe an + * intermediate value. + * + * It should be fine to treat this just as if the entire block had + * been truncated away - i.e. fill this and all later blocks with + * zeroes. WAL replay will fix things up. + */ + if (cnt < BLCKSZ) + break; + } /* * If the amount of data we were able to read was not a multiple of @@ -1692,6 +1901,56 @@ read_file_data_into_buffer(bbsink *sink, const char *readfilename, int fd, return cnt; } +/* + * Push data into a bbsink. + * + * It's better, when possible, to read data directly into the bbsink's buffer, + * rather than using this function to copy it into the buffer; this function is + * for cases where that approach is not practical. + * + * bytes_done should point to a count of the number of bytes that are + * currently used in the bbsink's buffer. Upon return, the bytes identified by + * data and length will have been copied into the bbsink's buffer, flushing + * as required, and *bytes_done will have been updated accordingly. If the + * buffer was flushed, the previous contents will also have been fed to + * checksum_ctx. + * + * Note that after one or more calls to this function it is the caller's + * responsibility to perform any required final flush. + */ +static void +push_to_sink(bbsink *sink, pg_checksum_context *checksum_ctx, + size_t *bytes_done, void *data, size_t length) +{ + while (length > 0) + { + size_t bytes_to_copy; + + /* + * We use < here rather than <= so that if the data exactly fills the + * remaining buffer space, we trigger a flush now. + */ + if (length < sink->bbs_buffer_length - *bytes_done) + { + /* Append remaining data to buffer. */ + memcpy(sink->bbs_buffer + *bytes_done, data, length); + *bytes_done += length; + return; + } + + /* Copy until buffer is full and flush it. */ + bytes_to_copy = sink->bbs_buffer_length - *bytes_done; + memcpy(sink->bbs_buffer + *bytes_done, data, bytes_to_copy); + data = ((char *) data) + bytes_to_copy; + length -= bytes_to_copy; + bbsink_archive_contents(sink, sink->bbs_buffer_length); + if (pg_checksum_update(checksum_ctx, (uint8 *) sink->bbs_buffer, + sink->bbs_buffer_length) < 0) + elog(ERROR, "could not update checksum"); + *bytes_done = 0; + } +} + /* * Try to verify the checksum for the provided page, if it seems appropriate * to do so. diff --git a/src/backend/backup/basebackup_incremental.c b/src/backend/backup/basebackup_incremental.c new file mode 100644 index 0000000000000..1e5a5ac33adf7 --- /dev/null +++ b/src/backend/backup/basebackup_incremental.c @@ -0,0 +1,1003 @@ +/*------------------------------------------------------------------------- + * + * basebackup_incremental.c + * code for incremental backup support + * + * This code isn't actually in charge of taking an incremental backup; + * the actual construction of the incremental backup happens in + * basebackup.c. Here, we're concerned with providing the necessary + * supports for that operation. In particular, we need to parse the + * backup manifest supplied by the user taking the incremental backup + * and extract the required information from it. + * + * Portions Copyright (c) 2010-2022, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/backend/backup/basebackup_incremental.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/timeline.h" +#include "access/xlog.h" +#include "access/xlogrecovery.h" +#include "backup/basebackup_incremental.h" +#include "backup/walsummary.h" +#include "common/blkreftable.h" +#include "common/parse_manifest.h" +#include "common/hashfn.h" +#include "postmaster/walsummarizer.h" + +#define BLOCKS_PER_READ 512 + +/* + * Details extracted from the WAL ranges present in the supplied backup manifest. + */ +typedef struct +{ + TimeLineID tli; + XLogRecPtr start_lsn; + XLogRecPtr end_lsn; +} backup_wal_range; + +/* + * Details extracted from the file list present in the supplied backup manifest. + */ +typedef struct +{ + uint32 status; + const char *path; + size_t size; +} backup_file_entry; + +static uint32 hash_string_pointer(const char *s); +#define SH_PREFIX backup_file +#define SH_ELEMENT_TYPE backup_file_entry +#define SH_KEY_TYPE const char * +#define SH_KEY path +#define SH_HASH_KEY(tb, key) hash_string_pointer(key) +#define SH_EQUAL(tb, a, b) (strcmp(a, b) == 0) +#define SH_SCOPE static inline +#define SH_DECLARE +#define SH_DEFINE +#include "lib/simplehash.h" + +struct IncrementalBackupInfo +{ + /* Memory context for this object and its subsidiary objects. */ + MemoryContext mcxt; + + /* Temporary buffer for storing the manifest while parsing it. */ + StringInfoData buf; + + /* WAL ranges extracted from the backup manifest. */ + List *manifest_wal_ranges; + + /* + * Files extracted from the backup manifest. + * + * We don't really need this information, because we use WAL summaries to + * figure what's changed. It would be unsafe to just rely on the list of + * files that existed before, because it's possible for a file to be + * removed and a new one created with the same name and different + * contents. In such cases, the whole file must still be sent. We can tell + * from the WAL summaries whether that happened, but not from the file + * list. + * + * Nonetheless, this data is useful for sanity checking. If a file that we + * think we shouldn't need to send is not present in the manifest for the + * prior backup, something has gone terribly wrong. We retain the file + * names and sizes, but not the checksums or last modified times, for + * which we have no use. + * + * One significant downside of storing this data is that it consumes + * memory. If that turns out to be a problem, we might have to decide not + * to retain this information, or to make it optional. + */ + backup_file_hash *manifest_files; + + /* + * Block-reference table for the incremental backup. + * + * It's possible that storing the entire block-reference table in memory + * will be a problem for some users. The in-memory format that we're using + * here is pretty efficient, converging to little more than 1 bit per + * block for relation forks with large numbers of modified blocks. It's + * possible, however, that if you try to perform an incremental backup of + * a database with a sufficiently large number of relations on a + * sufficiently small machine, you could run out of memory here. If that + * turns out to be a problem in practice, we'll need to be more clever. + */ + BlockRefTable *brtab; +}; + +static void manifest_process_file(JsonManifestParseContext *context, + char *pathname, + size_t size, + pg_checksum_type checksum_type, + int checksum_length, + uint8 *checksum_payload); +static void manifest_process_wal_range(JsonManifestParseContext *context, + TimeLineID tli, + XLogRecPtr start_lsn, + XLogRecPtr end_lsn); +static void manifest_report_error(JsonManifestParseContext *ib, + const char *fmt,...) + pg_attribute_printf(2, 3) pg_attribute_noreturn(); +static int compare_block_numbers(const void *a, const void *b); + +/* + * Create a new object for storing information extracted from the manifest + * supplied when creating an incremental backup. + */ +IncrementalBackupInfo * +CreateIncrementalBackupInfo(MemoryContext mcxt) +{ + IncrementalBackupInfo *ib; + MemoryContext oldcontext; + + oldcontext = MemoryContextSwitchTo(mcxt); + + ib = palloc0(sizeof(IncrementalBackupInfo)); + ib->mcxt = mcxt; + initStringInfo(&ib->buf); + + /* + * It's hard to guess how many files a "typical" installation will have in + * the data directory, but a fresh initdb creates almost 1000 files as of + * this writing, so it seems to make sense for our estimate to + * substantially higher. + */ + ib->manifest_files = backup_file_create(mcxt, 10000, NULL); + + MemoryContextSwitchTo(oldcontext); + + return ib; +} + +/* + * Before taking an incremental backup, the caller must supply the backup + * manifest from a prior backup. Each chunk of manifest data recieved + * from the client should be passed to this function. + */ +void +AppendIncrementalManifestData(IncrementalBackupInfo *ib, const char *data, + int len) +{ + MemoryContext oldcontext; + + /* Switch to our memory context. */ + oldcontext = MemoryContextSwitchTo(ib->mcxt); + + /* + * XXX. Our json parser is at present incapable of parsing json blobs + * incrementally, so we have to accumulate the entire backup manifest + * before we can do anything with it. This should really be fixed, since + * some users might have very large numbers of files in the data + * directory. + */ + appendBinaryStringInfo(&ib->buf, data, len); + + /* Switch back to previous memory context. */ + MemoryContextSwitchTo(oldcontext); +} + +/* + * Finalize an IncrementalBackupInfo object after all manifest data has + * been supplied via calls to AppendIncrementalManifestData. + */ +void +FinalizeIncrementalManifest(IncrementalBackupInfo *ib) +{ + JsonManifestParseContext context; + MemoryContext oldcontext; + + /* Switch to our memory context. */ + oldcontext = MemoryContextSwitchTo(ib->mcxt); + + /* Parse the manifest. */ + context.private_data = ib; + context.per_file_cb = manifest_process_file; + context.per_wal_range_cb = manifest_process_wal_range; + context.error_cb = manifest_report_error; + json_parse_manifest(&context, ib->buf.data, ib->buf.len); + + /* Done with the buffer, so release memory. */ + pfree(ib->buf.data); + ib->buf.data = NULL; + + /* Switch back to previous memory context. */ + MemoryContextSwitchTo(oldcontext); +} + +/* + * Prepare to take an incremental backup. + * + * Before this function is called, AppendIncrementalManifestData and + * FinalizeIncrementalManifest should have already been called to pass all + * the manifest data to this object. + * + * This function performs sanity checks on the data extracted from the + * manifest and figures out for which WAL ranges we need summaries, and + * whether those summaries are available. Then, it reads and combines the + * data from those summary files. It also updates the backup_state with the + * reference TLI and LSN for the prior backup. + */ +void +PrepareForIncrementalBackup(IncrementalBackupInfo *ib, + BackupState *backup_state) +{ + MemoryContext oldcontext; + List *expectedTLEs; + List *all_wslist, + *required_wslist = NIL; + ListCell *lc; + TimeLineHistoryEntry **tlep; + int num_wal_ranges; + int i; + bool found_backup_start_tli = false; + TimeLineID earliest_wal_range_tli = 0; + XLogRecPtr earliest_wal_range_start_lsn = InvalidXLogRecPtr; + TimeLineID latest_wal_range_tli = 0; + XLogRecPtr summarized_lsn; + XLogRecPtr pending_lsn; + XLogRecPtr prior_pending_lsn = InvalidXLogRecPtr; + int deadcycles = 0; + TimestampTz initial_time, + current_time; + + Assert(ib->buf.data == NULL); + + /* Switch to our memory context. */ + oldcontext = MemoryContextSwitchTo(ib->mcxt); + + /* + * A valid backup manifest must always contain at least one WAL range + * (usually exactly one, unless the backup spanned a timeline switch). + */ + num_wal_ranges = list_length(ib->manifest_wal_ranges); + if (num_wal_ranges == 0) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("manifest contains no required WAL ranges"))); + + /* + * Match up the TLIs that appear in the WAL ranges of the backup manifest + * with those that appear in this server's timeline history. We expect + * every backup_wal_range to match to a TimeLineHistoryEntry; if it does + * not, that's an error. + * + * This loop also decides which of the WAL ranges is the manifest is most + * ancient and which one is the newest, according to the timeline history + * of this server, and stores TLIs of those WAL ranges into + * earliest_wal_range_tli and latest_wal_range_tli. It also updates + * earliest_wal_range_start_lsn to the start LSN of the WAL range for + * earliest_wal_range_tli. + * + * Note that the return value of readTimeLineHistory puts the latest + * timeline at the beginning of the list, not the end. Hence, the earliest + * TLI is the one that occurs nearest the end of the list returned by + * readTimeLineHistory, and the latest TLI is the one that occurs closest + * to the beginning. + */ + expectedTLEs = readTimeLineHistory(backup_state->starttli); + tlep = palloc0(num_wal_ranges * sizeof(TimeLineHistoryEntry *)); + for (i = 0; i < num_wal_ranges; ++i) + { + backup_wal_range *range = list_nth(ib->manifest_wal_ranges, i); + bool saw_earliest_wal_range_tli = false; + bool saw_latest_wal_range_tli = false; + + /* Search this server's history for this WAL range's TLI. */ + foreach(lc, expectedTLEs) + { + TimeLineHistoryEntry *tle = lfirst(lc); + + if (tle->tli == range->tli) + { + tlep[i] = tle; + break; + } + + if (tle->tli == earliest_wal_range_tli) + saw_earliest_wal_range_tli = true; + if (tle->tli == latest_wal_range_tli) + saw_latest_wal_range_tli = true; + } + + /* + * An incremental backup can only be taken relative to a backup that + * represents a previous state of this server. If the backup requires + * WAL from a timeline that's not in our history, that definitely + * isn't the case. + */ + if (tlep[i] == NULL) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("timeline %u found in manifest, but not in this server's history", + range->tli))); + + /* + * If we found this TLI in the server's history before encountering + * the latest TLI seen so far in the server's history, then this TLI + * is the latest one seen so far. + * + * If on the other hand we saw the earliest TLI seen so far before + * finding this TLI, this TLI is earlier than the earliest one seen so + * far. And if this is the first TLI for which we've searched, it's + * also the earliest one seen so far. + * + * On the first loop iteration, both things should necessarily be + * true. + */ + if (!saw_latest_wal_range_tli) + latest_wal_range_tli = range->tli; + if (earliest_wal_range_tli == 0 || saw_earliest_wal_range_tli) + { + earliest_wal_range_tli = range->tli; + earliest_wal_range_start_lsn = range->start_lsn; + } + } + + /* + * Propagate information about the prior backup into the backup_label that + * will be generated for this backup. + */ + backup_state->istartpoint = earliest_wal_range_start_lsn; + backup_state->istarttli = earliest_wal_range_tli; + + /* + * Sanity check start and end LSNs for the WAL ranges in the manifest. + * + * Commonly, there won't be any timeline switches during the prior backup + * at all, but if there are, they should happen at the same LSNs that this + * server switched timelines. + * + * Whether there are any timeline switches during the prior backup or not, + * the prior backup shouldn't require any WAL from a timeline prior to the + * start of that timeline. It also shouldn't require any WAL from later + * than the start of this backup. + * + * If any of these sanity checks fail, one possible explanation is that + * the user has generated WAL on the same timeline with the same LSNs more + * than once. For instance, if two standbys running on timeline 1 were + * both promoted and (due to a broken archiving setup) both selected new + * timeline ID 2, then it's possible that one of these checks might trip. + * + * Note that there are lots of ways for the user to do something very bad + * without tripping any of these checks, and they are not intended to be + * comprehensive. It's pretty hard to see how we could be certain of + * anything here. However, if there's a problem staring us right in the + * face, it's best to report it, so we do. + */ + for (i = 0; i < num_wal_ranges; ++i) + { + backup_wal_range *range = list_nth(ib->manifest_wal_ranges, i); + + if (range->tli == earliest_wal_range_tli) + { + if (range->start_lsn < tlep[i]->begin) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("manifest requires WAL from initial timeline %u starting at %X/%X, but that timeline begins at %X/%X", + range->tli, + LSN_FORMAT_ARGS(range->start_lsn), + LSN_FORMAT_ARGS(tlep[i]->begin)))); + } + else + { + if (range->start_lsn != tlep[i]->begin) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("manifest requires WAL from continuation timeline %u starting at %X/%X, but that timeline begins at %X/%X", + range->tli, + LSN_FORMAT_ARGS(range->start_lsn), + LSN_FORMAT_ARGS(tlep[i]->begin)))); + } + + if (range->tli == latest_wal_range_tli) + { + if (range->end_lsn > backup_state->startpoint) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("manifest requires WAL from final timeline %u ending at %X/%X, but this backup starts at %X/%X", + range->tli, + LSN_FORMAT_ARGS(range->end_lsn), + LSN_FORMAT_ARGS(backup_state->startpoint)))); + } + else + { + if (range->end_lsn != tlep[i]->end) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("manifest requires WAL from non-final timeline %u ending at %X/%X, but this server switched timelines at %X/%X", + range->tli, + LSN_FORMAT_ARGS(range->end_lsn), + LSN_FORMAT_ARGS(tlep[i]->end)))); + } + + } + + /* + * Wait for WAL summarization to catch up to the backup start LSN (but + * time out if it doesn't do so quickly enough). + */ + initial_time = current_time = GetCurrentTimestamp(); + while (1) + { + long timeout_in_ms = 10000; + unsigned elapsed_seconds; + + /* + * Align the wait time to prevent drift. This doesn't really matter, + * but we'd like the warnings about how long we've been waiting to say + * 10 seconds, 20 seconds, 30 seconds, 40 seconds ... without ever + * drifting to something that is not a multiple of ten. + */ + timeout_in_ms -= + TimestampDifferenceMilliseconds(current_time, initial_time) % + timeout_in_ms; + + /* Wait for up to 10 seconds. */ + summarized_lsn = WaitForWalSummarization(backup_state->startpoint, + 10000, &pending_lsn); + + /* If WAL summarization has progressed sufficiently, stop waiting. */ + if (summarized_lsn >= backup_state->startpoint) + break; + + /* + * Keep track of the number of cycles during which there has been no + * progression of pending_lsn. If pending_lsn is not advancing, that + * means that not only are no new files appearing on disk, but we're + * not even incorporating new records into the in-memory state. + */ + if (pending_lsn > prior_pending_lsn) + { + prior_pending_lsn = pending_lsn; + deadcycles = 0; + } + else + ++deadcycles; + + /* + * If we've managed to wait for an entire minute withot the WAL + * summarizer absorbing a single WAL record, error out; probably + * something is wrong. + * + * We could consider also erroring out if the summarizer is taking too + * long to catch up, but it's not clear what rate of progress would be + * acceptable and what would be too slow. So instead, we just try to + * error out in the case where there's no progress at all. That seems + * likely to catch a reasonable number of the things that can go wrong + * in practice (e.g. the summarizer process is completely hung, say + * because somebody hooked up a debugger to it or something) without + * giving up too quickly when the sytem is just slow. + */ + if (deadcycles >= 6) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("WAL summarization is not progressing"), + errdetail("Summarization is needed through %X/%X, but is stuck at %X/%X on disk and %X/%X in memory.", + LSN_FORMAT_ARGS(backup_state->startpoint), + LSN_FORMAT_ARGS(summarized_lsn), + LSN_FORMAT_ARGS(pending_lsn)))); + + /* + * Otherwise, just let the user know what's happening. + */ + current_time = GetCurrentTimestamp(); + elapsed_seconds = + TimestampDifferenceMilliseconds(initial_time, current_time) / 1000; + ereport(WARNING, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("still waiting for WAL summarization through %X/%X after %d seconds", + LSN_FORMAT_ARGS(backup_state->startpoint), + elapsed_seconds), + errdetail("Summarization has reached %X/%X on disk and %X/%X in memory.", + LSN_FORMAT_ARGS(summarized_lsn), + LSN_FORMAT_ARGS(pending_lsn)))); + } + + /* + * Retrieve a list of all WAL summaries on any timeline that overlap with + * the LSN range of interest. We could instead call GetWalSummaries() once + * per timeline in the loop that follows, but that would involve reading + * the directory multiple times. It should be mildly faster - and perhaps + * a bit safer - to do it just once. + */ + all_wslist = GetWalSummaries(0, earliest_wal_range_start_lsn, + backup_state->startpoint); + + /* + * We need WAL summaries for everything that happened during the prior + * backup and everything that happened afterward up until the point where + * the current backup started. + */ + foreach(lc, expectedTLEs) + { + TimeLineHistoryEntry *tle = lfirst(lc); + XLogRecPtr tli_start_lsn = tle->begin; + XLogRecPtr tli_end_lsn = tle->end; + XLogRecPtr tli_missing_lsn = InvalidXLogRecPtr; + List *tli_wslist; + + /* + * Working through the history of this server from the current + * timeline backwards, we skip everything until we find the timeline + * where this backup started. Most of the time, this means we won't + * skip anything at all, as it's unlikely that the timeline has + * changed since the beginning of the backup moments ago. + */ + if (tle->tli == backup_state->starttli) + { + found_backup_start_tli = true; + tli_end_lsn = backup_state->startpoint; + } + else if (!found_backup_start_tli) + continue; + + /* + * Find the summaries that overlap the LSN range of interest for this + * timeline. If this is the earliest timeline involved, the range of + * interest begins with the start LSN of the prior backup; otherwise, + * it begins at the LSN at which this timeline came into existence. If + * this is the latest TLI involved, the range of interest ends at the + * start LSN of the current backup; otherwise, it ends at the point + * where we switched from this timeline to the next one. + */ + if (tle->tli == earliest_wal_range_tli) + tli_start_lsn = earliest_wal_range_start_lsn; + tli_wslist = FilterWalSummaries(all_wslist, tle->tli, + tli_start_lsn, tli_end_lsn); + + /* + * There is no guarantee that the WAL summaries we found cover the + * entire range of LSNs for which summaries are required, or indeed + * that we found any WAL summaries at all. Check whether we have a + * problem of that sort. + */ + if (!WalSummariesAreComplete(tli_wslist, tli_start_lsn, tli_end_lsn, + &tli_missing_lsn)) + { + if (XLogRecPtrIsInvalid(tli_missing_lsn)) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("WAL summaries are required on timeline %u from %X/%X to %X/%X, but no summaries for that timeline and LSN range exist", + tle->tli, + LSN_FORMAT_ARGS(tli_start_lsn), + LSN_FORMAT_ARGS(tli_end_lsn)))); + else + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("WAL summaries are required on timeline %u from %X/%X to %X/%X, but the summaries for that timeline and LSN range are incomplete", + tle->tli, + LSN_FORMAT_ARGS(tli_start_lsn), + LSN_FORMAT_ARGS(tli_end_lsn)), + errdetail("The first unsummarized LSN is this range is %X/%X.", + LSN_FORMAT_ARGS(tli_missing_lsn)))); + } + + /* + * Remember that we need to read these summaries. + * + * Technically, it's possible that this could read more files than + * required, since tli_wslist in theory could contain redundant + * summaries. For instance, if we have a summary from 0/10000000 to + * 0/20000000 and also one from 0/00000000 to 0/30000000, then the + * latter subsumes the former and the former could be ignored. + * + * We ignore this possibility because the WAL summarizer only tries to + * generate summaries that do not overlap. If somehow they exist, + * we'll do a bit of extra work but the results should still be + * correct. + */ + required_wslist = list_concat(required_wslist, tli_wslist); + + /* + * Timelines earlier than the one in which the prior backup began are + * not relevant. + */ + if (tle->tli == earliest_wal_range_tli) + break; + } + + /* + * Read all of the required block reference table files and merge all of + * the data into a single in-memory block reference table. + * + * See the comments for struct IncrementalBackupInfo for some thoughts on + * memory usage. + */ + ib->brtab = CreateEmptyBlockRefTable(); + foreach(lc, required_wslist) + { + WalSummaryFile *ws = lfirst(lc); + WalSummaryIO wsio; + BlockRefTableReader *reader; + RelFileLocator rlocator; + ForkNumber forknum; + BlockNumber limit_block; + BlockNumber blocks[BLOCKS_PER_READ]; + + wsio.file = OpenWalSummaryFile(ws, false); + wsio.filepos = 0; + ereport(DEBUG1, + (errmsg_internal("reading WAL summary file \"%s\"", + FilePathName(wsio.file)))); + reader = CreateBlockRefTableReader(ReadWalSummary, &wsio, + FilePathName(wsio.file), + ReportWalSummaryError, NULL); + while (BlockRefTableReaderNextRelation(reader, &rlocator, &forknum, + &limit_block)) + { + BlockRefTableSetLimitBlock(ib->brtab, &rlocator, + forknum, limit_block); + + while (1) + { + unsigned nblocks; + unsigned i; + + nblocks = BlockRefTableReaderGetBlocks(reader, blocks, + BLOCKS_PER_READ); + if (nblocks == 0) + break; + + for (i = 0; i < nblocks; ++i) + BlockRefTableMarkBlockModified(ib->brtab, &rlocator, + forknum, blocks[i]); + } + } + DestroyBlockRefTableReader(reader); + FileClose(wsio.file); + } + + /* Switch back to previous memory context. */ + MemoryContextSwitchTo(oldcontext); +} + +/* + * Get the pathname that should be used when a file is sent incrementally. + * + * The result is a palloc'd string. + */ +char * +GetIncrementalFilePath(Oid dboid, Oid spcoid, RelFileNumber relfilenumber, + ForkNumber forknum, unsigned segno) +{ + char *path; + char *lastslash; + char *ipath; + + path = GetRelationPath(dboid, spcoid, relfilenumber, InvalidBackendId, + forknum); + + lastslash = strrchr(path, '/'); + Assert(lastslash != NULL); + *lastslash = '\0'; + + if (segno > 0) + ipath = psprintf("%s/INCREMENTAL.%s.%u", path, lastslash + 1, segno); + else + ipath = psprintf("%s/INCREMENTAL.%s", path, lastslash + 1); + + pfree(path); + + return ipath; +} + +/* + * How should we back up a particular file as part of an incremental backup? + * + * If the return value is BACK_UP_FILE_FULLY, caller should back up the whole + * file just as if this were not an incremental backup. + * + * If the return value is BACK_UP_FILE_INCREMENTALLY, caller should include + * an incremental file in the backup instead of the entire file. On return, + * *num_blocks_required will be set to the number of blocks that need to be + * sent, and the actual block numbers will have been stored in + * relative_block_numbers, which should be an array of at least RELSEG_SIZE. + * In addition, *truncation_block_length will be set to the value that should + * be included in the incremental file. + */ +FileBackupMethod +GetFileBackupMethod(IncrementalBackupInfo *ib, const char *path, + Oid dboid, Oid spcoid, + RelFileNumber relfilenumber, ForkNumber forknum, + unsigned segno, size_t size, + unsigned *num_blocks_required, + BlockNumber *relative_block_numbers, + unsigned *truncation_block_length) +{ + BlockNumber absolute_block_numbers[RELSEG_SIZE]; + BlockNumber limit_block; + BlockNumber start_blkno; + BlockNumber stop_blkno; + RelFileLocator rlocator; + BlockRefTableEntry *brtentry; + unsigned i; + unsigned nblocks; + + /* Should only be called after PrepareForIncrementalBackup. */ + Assert(ib->buf.data == NULL); + + /* + * dboid could be InvalidOid if shared rel, but spcoid and relfilenumber + * should have legal values. + */ + Assert(OidIsValid(spcoid)); + Assert(RelFileNumberIsValid(relfilenumber)); + + /* + * If the file size is too large or not a multiple of BLCKSZ, then + * something weird is happening, so give up and send the whole file. + */ + if ((size % BLCKSZ) != 0 || size / BLCKSZ > RELSEG_SIZE) + return BACK_UP_FILE_FULLY; + + /* + * The free-space map fork is not properly WAL-logged, so we need to + * backup the entire file every time. + */ + if (forknum == FSM_FORKNUM) + return BACK_UP_FILE_FULLY; + + /* + * If this file was not part of the prior backup, back it up fully. + * + * If this file was created after the prior backup and before the start of + * the current backup, then the WAL summary information will tell us to + * back up the whole file. However, if this file was created after the + * start of the current backup, then the WAL summary won't know anything + * about it. Without this logic, we would erroneously conclude that it was + * OK to send it incrementally. + * + * Note that the file could have existed at the time of the prior backup, + * gotten deleted, and then a new file with the same name could have been + * created. In that case, this logic won't prevent the file from being + * backed up incrementally. But, if the deletion happened before the start + * of the current backup, the limit block will be 0, inducing a full + * backup. If the deletion happened after the start of the current backup, + * reconstruction will erroneously combine blocks from the current + * lifespan of the file with blocks from the previous lifespan -- but in + * this type of case, WAL replay to reach backup consistency should remove + * and recreate the file anyway, so the initial bogus contents should not + * matter. + */ + if (backup_file_lookup(ib->manifest_files, path) == NULL) + { + char *ipath; + + ipath = GetIncrementalFilePath(dboid, spcoid, relfilenumber, + forknum, segno); + if (backup_file_lookup(ib->manifest_files, ipath) == NULL) + return BACK_UP_FILE_FULLY; + } + + /* Look up the block reference table entry. */ + rlocator.spcOid = spcoid; + rlocator.dbOid = dboid; + rlocator.relNumber = relfilenumber; + brtentry = BlockRefTableGetEntry(ib->brtab, &rlocator, forknum, + &limit_block); + + /* + * If there is no entry, then there have been no WAL-logged changes to the + * relation since the predecessor backup was taken, so we can back it up + * incrementally and need not include any modified blocks. + * + * However, if the file is zero-length, we should do a full backup, + * because an incremental file is always more than zero length, and it's + * silly to take an incremental backup when a full backup would be + * smaller. + */ + if (brtentry == NULL) + { + if (size == 0) + return BACK_UP_FILE_FULLY; + *num_blocks_required = 0; + *truncation_block_length = size / BLCKSZ; + return BACK_UP_FILE_INCREMENTALLY; + } + + /* + * If the limit_block is less than or equal to the point where this + * segment starts, send the whole file. + */ + if (limit_block <= segno * RELSEG_SIZE) + return BACK_UP_FILE_FULLY; + + /* + * Get relevant entries from the block reference table entry. + * + * We shouldn't overflow computing the start or stop block numbers, but if + * it manages to happen somehow, detect it and throw an error. + */ + start_blkno = segno * RELSEG_SIZE; + stop_blkno = start_blkno + (size / BLCKSZ); + if (start_blkno / RELSEG_SIZE != segno || stop_blkno < start_blkno) + ereport(ERROR, + errcode(ERRCODE_INTERNAL_ERROR), + errmsg_internal("overflow computing block number bounds for segment %u with size %zu", + segno, size)); + nblocks = BlockRefTableEntryGetBlocks(brtentry, start_blkno, stop_blkno, + absolute_block_numbers, RELSEG_SIZE); + Assert(nblocks <= RELSEG_SIZE); + + /* + * If we're going to have to send nearly all of the blocks, then just send + * the whole file, because that won't require much extra storage or + * transfer and will speed up and simplify backup restoration. It's not + * clear what threshold is most appropriate here and perhaps it ought to + * be configurable, but for now we're just going to say that if we'd need + * to send 90% of the blocks anyway, give up and send the whole file. + * + * NB: If you change the threshold here, at least make sure to back up the + * file fully when every single block must be sent, because there's + * nothing good about sending an incremental file in that case. + */ + if (nblocks * BLCKSZ > size * 0.9) + return BACK_UP_FILE_FULLY; + + /* + * Looks like we can send an incremental file, so sort the absolute the + * block numbers and then transpose absolute block numbers to relative + * block numbers. + * + * NB: If the block reference table was using the bitmap representation + * for a given chunk, the block numbers in that chunk will already be + * sorted, but when the array-of-offsets representation is used, we can + * receive block numbers here out of order. + */ + qsort(absolute_block_numbers, nblocks, sizeof(BlockNumber), + compare_block_numbers); + for (i = 0; i < nblocks; ++i) + relative_block_numbers[i] = absolute_block_numbers[i] - start_blkno; + *num_blocks_required = nblocks; + + /* + * The truncation block length is the minimum length of the reconstructed + * file. Any block numbers below this threshold that are not present in + * the backup need to be fetched from the prior backup. At or above this + * threshold, blocks should only be included in the result if they are + * present in the backup. (This may require inserting zero blocks if the + * blocks included in the backup are non-consecutive.) + */ + *truncation_block_length = size / BLCKSZ; + if (BlockNumberIsValid(limit_block)) + { + unsigned relative_limit = limit_block - segno * RELSEG_SIZE; + + if (*truncation_block_length < relative_limit) + *truncation_block_length = relative_limit; + } + + /* Send it incrementally. */ + return BACK_UP_FILE_INCREMENTALLY; +} + +/* + * Compute the size for an incremental file containing a given number of blocks. + */ +extern size_t +GetIncrementalFileSize(unsigned num_blocks_required) +{ + size_t result; + + /* Make sure we're not going to overflow. */ + Assert(num_blocks_required <= RELSEG_SIZE); + + /* + * Three four byte quantities (magic number, truncation block length, + * block count) followed by block numbers followed by block contents. + */ + result = 3 * sizeof(uint32); + result += (BLCKSZ + sizeof(BlockNumber)) * num_blocks_required; + + return result; +} + +/* + * Helper function for filemap hash table. + */ +static uint32 +hash_string_pointer(const char *s) +{ + unsigned char *ss = (unsigned char *) s; + + return hash_bytes(ss, strlen(s)); +} + +/* + * This callback is invoked for each file mentioned in the backup manifest. + * + * We store the path to each file and the size of each file for sanity-checking + * purposes. For further details, see comments for IncrementalBackupInfo. + */ +static void +manifest_process_file(JsonManifestParseContext *context, + char *pathname, size_t size, + pg_checksum_type checksum_type, + int checksum_length, + uint8 *checksum_payload) +{ + IncrementalBackupInfo *ib = context->private_data; + backup_file_entry *entry; + bool found; + + entry = backup_file_insert(ib->manifest_files, pathname, &found); + if (!found) + { + entry->path = MemoryContextStrdup(ib->manifest_files->ctx, + pathname); + entry->size = size; + } +} + +/* + * This callback is invoked for each WAL range mentioned in the backup + * manifest. + * + * We're just interested in learning the oldest LSN and the corresponding TLI + * that appear in any WAL range. + */ +static void +manifest_process_wal_range(JsonManifestParseContext *context, + TimeLineID tli, XLogRecPtr start_lsn, + XLogRecPtr end_lsn) +{ + IncrementalBackupInfo *ib = context->private_data; + backup_wal_range *range = palloc(sizeof(backup_wal_range)); + + range->tli = tli; + range->start_lsn = start_lsn; + range->end_lsn = end_lsn; + ib->manifest_wal_ranges = lappend(ib->manifest_wal_ranges, range); +} + +/* + * This callback is invoked if an error occurs while parsing the backup + * manifest. + */ +static void +manifest_report_error(JsonManifestParseContext *context, const char *fmt,...) +{ + StringInfoData errbuf; + + initStringInfo(&errbuf); + + for (;;) + { + va_list ap; + int needed; + + va_start(ap, fmt); + needed = appendStringInfoVA(&errbuf, fmt, ap); + va_end(ap); + if (needed == 0) + break; + enlargeStringInfo(&errbuf, needed); + } + + ereport(ERROR, + errmsg_internal("%s", errbuf.data)); +} + +/* + * Quicksort comparator for block numbers. + */ +static int +compare_block_numbers(const void *a, const void *b) +{ + BlockNumber aa = *(BlockNumber *) a; + BlockNumber bb = *(BlockNumber *) b; + + if (aa > bb) + return 1; + else if (aa == bb) + return 0; + else + return -1; +} diff --git a/src/backend/backup/meson.build b/src/backend/backup/meson.build index 5d4ebe3ebeddd..2a6a2dc7c0e8d 100644 --- a/src/backend/backup/meson.build +++ b/src/backend/backup/meson.build @@ -5,6 +5,7 @@ backend_sources += files( 'basebackup.c', 'basebackup_copy.c', 'basebackup_gzip.c', + 'basebackup_incremental.c', 'basebackup_lz4.c', 'basebackup_progress.c', 'basebackup_server.c', diff --git a/src/backend/replication/repl_gram.y b/src/backend/replication/repl_gram.y index 0c874e33cf6ab..a5d118ed683aa 100644 --- a/src/backend/replication/repl_gram.y +++ b/src/backend/replication/repl_gram.y @@ -76,11 +76,12 @@ Node *replication_parse_result; %token K_EXPORT_SNAPSHOT %token K_NOEXPORT_SNAPSHOT %token K_USE_SNAPSHOT +%token K_UPLOAD_MANIFEST %type command %type base_backup start_replication start_logical_replication create_replication_slot drop_replication_slot identify_system - read_replication_slot timeline_history show + read_replication_slot timeline_history show upload_manifest %type generic_option_list %type generic_option %type opt_timeline @@ -114,6 +115,7 @@ command: | read_replication_slot | timeline_history | show + | upload_manifest ; /* @@ -307,6 +309,15 @@ timeline_history: } ; +/* UPLOAD_MANIFEST doesn't currently accept any arguments */ +upload_manifest: + K_UPLOAD_MANIFEST + { + UploadManifestCmd *cmd = makeNode(UploadManifestCmd); + + $$ = (Node *) cmd; + } + opt_physical: K_PHYSICAL | /* EMPTY */ @@ -411,6 +422,7 @@ ident_or_keyword: | K_EXPORT_SNAPSHOT { $$ = "export_snapshot"; } | K_NOEXPORT_SNAPSHOT { $$ = "noexport_snapshot"; } | K_USE_SNAPSHOT { $$ = "use_snapshot"; } + | K_UPLOAD_MANIFEST { $$ = "upload_manifest"; } ; %% diff --git a/src/backend/replication/repl_scanner.l b/src/backend/replication/repl_scanner.l index 1cc7fb858cd58..4805da08ee3df 100644 --- a/src/backend/replication/repl_scanner.l +++ b/src/backend/replication/repl_scanner.l @@ -136,6 +136,7 @@ EXPORT_SNAPSHOT { return K_EXPORT_SNAPSHOT; } NOEXPORT_SNAPSHOT { return K_NOEXPORT_SNAPSHOT; } USE_SNAPSHOT { return K_USE_SNAPSHOT; } WAIT { return K_WAIT; } +UPLOAD_MANIFEST { return K_UPLOAD_MANIFEST; } {space}+ { /* do nothing */ } @@ -303,6 +304,7 @@ replication_scanner_is_replication_command(void) case K_DROP_REPLICATION_SLOT: case K_READ_REPLICATION_SLOT: case K_TIMELINE_HISTORY: + case K_UPLOAD_MANIFEST: case K_SHOW: /* Yes; push back the first token so we can parse later. */ repl_pushed_back_token = first_token; diff --git a/src/backend/replication/walsender.c b/src/backend/replication/walsender.c index 3bc9c823895e1..dbcda325540dc 100644 --- a/src/backend/replication/walsender.c +++ b/src/backend/replication/walsender.c @@ -58,6 +58,7 @@ #include "access/xlogrecovery.h" #include "access/xlogutils.h" #include "backup/basebackup.h" +#include "backup/basebackup_incremental.h" #include "catalog/pg_authid.h" #include "catalog/pg_type.h" #include "commands/dbcommands.h" @@ -137,6 +138,17 @@ bool wake_wal_senders = false; */ static XLogReaderState *xlogreader = NULL; +/* + * If the UPLOAD_MANIFEST command is used to provide a backup manifest in + * preparation for an incremental backup, uploaded_manifest will be point + * to an object containing information about its contexts, and + * uploaded_manifest_mcxt will point to the memory context that contains + * that object and all of its subordinate data. Otherwise, both values will + * be NULL. + */ +static IncrementalBackupInfo *uploaded_manifest = NULL; +static MemoryContext uploaded_manifest_mcxt = NULL; + /* * These variables keep track of the state of the timeline we're currently * sending. sendTimeLine identifies the timeline. If sendTimeLineIsHistoric, @@ -233,6 +245,9 @@ static void XLogSendLogical(void); static void WalSndDone(WalSndSendDataCallback send_data); static XLogRecPtr GetStandbyFlushRecPtr(TimeLineID *tli); static void IdentifySystem(void); +static void UploadManifest(void); +static bool HandleUploadManifestPacket(StringInfo buf, off_t *offset, + IncrementalBackupInfo *ib); static void ReadReplicationSlot(ReadReplicationSlotCmd *cmd); static void CreateReplicationSlot(CreateReplicationSlotCmd *cmd); static void DropReplicationSlot(DropReplicationSlotCmd *cmd); @@ -660,6 +675,143 @@ SendTimeLineHistory(TimeLineHistoryCmd *cmd) pq_endmessage(&buf); } +/* + * Handle UPLOAD_MANIFEST command. + */ +static void +UploadManifest(void) +{ + MemoryContext mcxt; + IncrementalBackupInfo *ib; + off_t offset = 0; + StringInfoData buf; + + /* + * parsing the manifest will use the cryptohash stuff, which requires a + * resource owner + */ + Assert(CurrentResourceOwner == NULL); + CurrentResourceOwner = ResourceOwnerCreate(NULL, "base backup"); + + /* Prepare to read manifest data into a temporary context. */ + mcxt = AllocSetContextCreate(CurrentMemoryContext, + "incremental backup information", + ALLOCSET_DEFAULT_SIZES); + ib = CreateIncrementalBackupInfo(mcxt); + + /* Send a CopyInResponse message */ + pq_beginmessage(&buf, 'G'); + pq_sendbyte(&buf, 0); + pq_sendint16(&buf, 0); + pq_endmessage_reuse(&buf); + pq_flush(); + + /* Recieve packets from client until done. */ + while (HandleUploadManifestPacket(&buf, &offset, ib)) + ; + + /* Finish up manifest processing. */ + FinalizeIncrementalManifest(ib); + + /* + * Discard any old manifest information and arrange to preserve the new + * information we just got. + * + * We assume that MemoryContextDelete and MemoryContextSetParent won't + * fail, and thus we shouldn't end up bailing out of here in such a way as + * to leave dangling pointrs. + */ + if (uploaded_manifest_mcxt != NULL) + MemoryContextDelete(uploaded_manifest_mcxt); + MemoryContextSetParent(mcxt, CacheMemoryContext); + uploaded_manifest = ib; + uploaded_manifest_mcxt = mcxt; + + /* clean up the resource owner we created */ + WalSndResourceCleanup(true); +} + +/* + * Process one packet received during the handling of an UPLOAD_MANIFEST + * operation. + * + * 'buf' is scratch space. This function expects it to be initialized, doesn't + * care what the current contents are, and may override them with completely + * new contents. + * + * The return value is true if the caller should continue processing + * additional packets and false if the UPLOAD_MANIFEST operation is complete. + */ +static bool +HandleUploadManifestPacket(StringInfo buf, off_t *offset, + IncrementalBackupInfo *ib) +{ + int mtype; + int maxmsglen; + + HOLD_CANCEL_INTERRUPTS(); + + pq_startmsgread(); + mtype = pq_getbyte(); + if (mtype == EOF) + ereport(ERROR, + (errcode(ERRCODE_CONNECTION_FAILURE), + errmsg("unexpected EOF on client connection with an open transaction"))); + + switch (mtype) + { + case 'd': /* CopyData */ + maxmsglen = PQ_LARGE_MESSAGE_LIMIT; + break; + case 'c': /* CopyDone */ + case 'f': /* CopyFail */ + case 'H': /* Flush */ + case 'S': /* Sync */ + maxmsglen = PQ_SMALL_MESSAGE_LIMIT; + break; + default: + ereport(ERROR, + (errcode(ERRCODE_PROTOCOL_VIOLATION), + errmsg("unexpected message type 0x%02X during COPY from stdin", + mtype))); + maxmsglen = 0; /* keep compiler quiet */ + break; + } + + /* Now collect the message body */ + if (pq_getmessage(buf, maxmsglen)) + ereport(ERROR, + (errcode(ERRCODE_CONNECTION_FAILURE), + errmsg("unexpected EOF on client connection with an open transaction"))); + RESUME_CANCEL_INTERRUPTS(); + + /* Process the message */ + switch (mtype) + { + case 'd': /* CopyData */ + AppendIncrementalManifestData(ib, buf->data, buf->len); + return true; + + case 'c': /* CopyDone */ + return false; + + case 'H': /* Sync */ + case 'S': /* Flush */ + /* Ignore these while in CopyOut mode as we do elsewhere. */ + return true; + + case 'f': + ereport(ERROR, + (errcode(ERRCODE_QUERY_CANCELED), + errmsg("COPY from stdin failed: %s", + pq_getmsgstring(buf)))); + } + + /* Not reached. */ + Assert(false); + return false; +} + /* * Handle START_REPLICATION command. * @@ -1801,7 +1953,7 @@ exec_replication_command(const char *cmd_string) cmdtag = "BASE_BACKUP"; set_ps_display(cmdtag); PreventInTransactionBlock(true, cmdtag); - SendBaseBackup((BaseBackupCmd *) cmd_node); + SendBaseBackup((BaseBackupCmd *) cmd_node, uploaded_manifest); EndReplicationCommand(cmdtag); break; @@ -1863,6 +2015,14 @@ exec_replication_command(const char *cmd_string) } break; + case T_UploadManifestCmd: + cmdtag = "UPLOAD_MANIFEST"; + set_ps_display(cmdtag); + PreventInTransactionBlock(true, cmdtag); + UploadManifest(); + EndReplicationCommand(cmdtag); + break; + default: elog(ERROR, "unrecognized replication command node tag: %u", cmd_node->type); diff --git a/src/backend/storage/ipc/ipci.c b/src/backend/storage/ipc/ipci.c index 0e0ac22bdd675..706140eb9f486 100644 --- a/src/backend/storage/ipc/ipci.c +++ b/src/backend/storage/ipc/ipci.c @@ -32,6 +32,7 @@ #include "postmaster/bgworker_internals.h" #include "postmaster/bgwriter.h" #include "postmaster/postmaster.h" +#include "postmaster/walsummarizer.h" #include "replication/logicallauncher.h" #include "replication/origin.h" #include "replication/slot.h" @@ -140,6 +141,7 @@ CalculateShmemSize(int *num_semaphores) size = add_size(size, ReplicationOriginShmemSize()); size = add_size(size, WalSndShmemSize()); size = add_size(size, WalRcvShmemSize()); + size = add_size(size, WalSummarizerShmemSize()); size = add_size(size, PgArchShmemSize()); size = add_size(size, ApplyLauncherShmemSize()); size = add_size(size, BTreeShmemSize()); @@ -337,6 +339,7 @@ CreateOrAttachShmemStructs(void) ReplicationOriginShmemInit(); WalSndShmemInit(); WalRcvShmemInit(); + WalSummarizerShmemInit(); PgArchShmemInit(); ApplyLauncherShmemInit(); diff --git a/src/bin/Makefile b/src/bin/Makefile index 373077bf52b2e..aa2210925e28f 100644 --- a/src/bin/Makefile +++ b/src/bin/Makefile @@ -19,6 +19,7 @@ SUBDIRS = \ pg_archivecleanup \ pg_basebackup \ pg_checksums \ + pg_combinebackup \ pg_config \ pg_controldata \ pg_ctl \ diff --git a/src/bin/meson.build b/src/bin/meson.build index 67cb50630c5a3..4cb6fd59bb876 100644 --- a/src/bin/meson.build +++ b/src/bin/meson.build @@ -5,6 +5,7 @@ subdir('pg_amcheck') subdir('pg_archivecleanup') subdir('pg_basebackup') subdir('pg_checksums') +subdir('pg_combinebackup') subdir('pg_config') subdir('pg_controldata') subdir('pg_ctl') diff --git a/src/bin/pg_basebackup/bbstreamer_file.c b/src/bin/pg_basebackup/bbstreamer_file.c index 45f32974ff6e3..6b78ee283d97d 100644 --- a/src/bin/pg_basebackup/bbstreamer_file.c +++ b/src/bin/pg_basebackup/bbstreamer_file.c @@ -296,6 +296,7 @@ should_allow_existing_directory(const char *pathname) if (strcmp(filename, "pg_wal") == 0 || strcmp(filename, "pg_xlog") == 0 || strcmp(filename, "archive_status") == 0 || + strcmp(filename, "summaries") == 0 || strcmp(filename, "pg_tblspc") == 0) return true; diff --git a/src/bin/pg_basebackup/pg_basebackup.c b/src/bin/pg_basebackup/pg_basebackup.c index f32684a8f2338..5795b91261fce 100644 --- a/src/bin/pg_basebackup/pg_basebackup.c +++ b/src/bin/pg_basebackup/pg_basebackup.c @@ -101,6 +101,11 @@ typedef void (*WriteDataCallback) (size_t nbytes, char *buf, */ #define MINIMUM_VERSION_FOR_TERMINATED_TARFILE 150000 +/* + * pg_wal/summaries exists beginning with version 17. + */ +#define MINIMUM_VERSION_FOR_WAL_SUMMARIES 170000 + /* * Different ways to include WAL */ @@ -217,7 +222,8 @@ static void ReceiveBackupManifestInMemoryChunk(size_t r, char *copybuf, void *callback_data); static void BaseBackup(char *compression_algorithm, char *compression_detail, CompressionLocation compressloc, - pg_compress_specification *client_compress); + pg_compress_specification *client_compress, + char *incremental_manifest); static bool reached_end_position(XLogRecPtr segendpos, uint32 timeline, bool segment_finished); @@ -390,6 +396,8 @@ usage(void) printf(_("\nOptions controlling the output:\n")); printf(_(" -D, --pgdata=DIRECTORY receive base backup into directory\n")); printf(_(" -F, --format=p|t output format (plain (default), tar)\n")); + printf(_(" -i, --incremental=OLDMANIFEST\n")); + printf(_(" take incremental backup\n")); printf(_(" -r, --max-rate=RATE maximum transfer rate to transfer data directory\n" " (in kB/s, or use suffix \"k\" or \"M\")\n")); printf(_(" -R, --write-recovery-conf\n" @@ -688,6 +696,23 @@ StartLogStreamer(char *startpos, uint32 timeline, char *sysidentifier, if (pg_mkdir_p(statusdir, pg_dir_create_mode) != 0 && errno != EEXIST) pg_fatal("could not create directory \"%s\": %m", statusdir); + + /* + * For newer server versions, likewise create pg_wal/summaries + */ + if (PQserverVersion(conn) < MINIMUM_VERSION_FOR_WAL_SUMMARIES) + { + char summarydir[MAXPGPATH]; + + snprintf(summarydir, sizeof(summarydir), "%s/%s/summaries", + basedir, + PQserverVersion(conn) < MINIMUM_VERSION_FOR_PG_WAL ? + "pg_xlog" : "pg_wal"); + + if (pg_mkdir_p(statusdir, pg_dir_create_mode) != 0 && + errno != EEXIST) + pg_fatal("could not create directory \"%s\": %m", summarydir); + } } /* @@ -1728,7 +1753,9 @@ ReceiveBackupManifestInMemoryChunk(size_t r, char *copybuf, static void BaseBackup(char *compression_algorithm, char *compression_detail, - CompressionLocation compressloc, pg_compress_specification *client_compress) + CompressionLocation compressloc, + pg_compress_specification *client_compress, + char *incremental_manifest) { PGresult *res; char *sysidentifier; @@ -1794,7 +1821,76 @@ BaseBackup(char *compression_algorithm, char *compression_detail, exit(1); /* - * Start the actual backup + * If the user wants an incremental backup, we must upload the manifest + * for the previous backup upon which it is to be based. + */ + if (incremental_manifest != NULL) + { + int fd; + char mbuf[65536]; + int nbytes; + + /* Reject if server is too old. */ + if (serverVersion < MINIMUM_VERSION_FOR_WAL_SUMMARIES) + pg_fatal("server does not support incremental backup"); + + /* Open the file. */ + fd = open(incremental_manifest, O_RDONLY | PG_BINARY, 0); + if (fd < 0) + pg_fatal("could not open file \"%s\": %m", incremental_manifest); + + /* Tell the server what we want to do. */ + if (PQsendQuery(conn, "UPLOAD_MANIFEST") == 0) + pg_fatal("could not send replication command \"%s\": %s", + "UPLOAD_MANIFEST", PQerrorMessage(conn)); + res = PQgetResult(conn); + if (PQresultStatus(res) != PGRES_COPY_IN) + { + if (PQresultStatus(res) == PGRES_FATAL_ERROR) + pg_fatal("could not upload manifest: %s", + PQerrorMessage(conn)); + else + pg_fatal("could not upload manifest: unexpected status %s", + PQresStatus(PQresultStatus(res))); + } + + /* Loop, reading from the file and sending the data to the server. */ + while ((nbytes = read(fd, mbuf, sizeof mbuf)) > 0) + { + if (PQputCopyData(conn, mbuf, nbytes) < 0) + pg_fatal("could not send COPY data: %s", + PQerrorMessage(conn)); + } + + /* Bail out if we exited the loop due to an error. */ + if (nbytes < 0) + pg_fatal("could not read file \"%s\": %m", incremental_manifest); + + /* End the COPY operation. */ + if (PQputCopyEnd(conn, NULL) < 0) + pg_fatal("could not send end-of-COPY: %s", + PQerrorMessage(conn)); + + /* See whether the server is happy with what we sent. */ + res = PQgetResult(conn); + if (PQresultStatus(res) == PGRES_FATAL_ERROR) + pg_fatal("could not upload manifest: %s", + PQerrorMessage(conn)); + else if (PQresultStatus(res) != PGRES_COMMAND_OK) + pg_fatal("could not upload manifest: unexpected status %s", + PQresStatus(PQresultStatus(res))); + + /* Consume ReadyForQuery message from server. */ + res = PQgetResult(conn); + if (res != NULL) + pg_fatal("unexpected extra result while sending manifest"); + + /* Add INCREMENTAL option to BASE_BACKUP command. */ + AppendPlainCommandOption(&buf, use_new_option_syntax, "INCREMENTAL"); + } + + /* + * Continue building up the options list for the BASE_BACKUP command. */ AppendStringCommandOption(&buf, use_new_option_syntax, "LABEL", label); if (estimatesize) @@ -1901,6 +1997,7 @@ BaseBackup(char *compression_algorithm, char *compression_detail, else basebkp = psprintf("BASE_BACKUP %s", buf.data); + /* OK, try to start the backup. */ if (PQsendQuery(conn, basebkp) == 0) pg_fatal("could not send replication command \"%s\": %s", "BASE_BACKUP", PQerrorMessage(conn)); @@ -2256,6 +2353,7 @@ main(int argc, char **argv) {"version", no_argument, NULL, 'V'}, {"pgdata", required_argument, NULL, 'D'}, {"format", required_argument, NULL, 'F'}, + {"incremental", required_argument, NULL, 'i'}, {"checkpoint", required_argument, NULL, 'c'}, {"create-slot", no_argument, NULL, 'C'}, {"max-rate", required_argument, NULL, 'r'}, @@ -2293,6 +2391,7 @@ main(int argc, char **argv) int option_index; char *compression_algorithm = "none"; char *compression_detail = NULL; + char *incremental_manifest = NULL; CompressionLocation compressloc = COMPRESS_LOCATION_UNSPECIFIED; pg_compress_specification client_compress; @@ -2317,7 +2416,7 @@ main(int argc, char **argv) atexit(cleanup_directories_atexit); - while ((c = getopt_long(argc, argv, "c:Cd:D:F:h:l:nNp:Pr:Rs:S:t:T:U:vwWX:zZ:", + while ((c = getopt_long(argc, argv, "c:Cd:D:F:h:i:l:nNp:Pr:Rs:S:t:T:U:vwWX:zZ:", long_options, &option_index)) != -1) { switch (c) @@ -2352,6 +2451,9 @@ main(int argc, char **argv) case 'h': dbhost = pg_strdup(optarg); break; + case 'i': + incremental_manifest = pg_strdup(optarg); + break; case 'l': label = pg_strdup(optarg); break; @@ -2765,7 +2867,7 @@ main(int argc, char **argv) } BaseBackup(compression_algorithm, compression_detail, compressloc, - &client_compress); + &client_compress, incremental_manifest); success = true; return 0; diff --git a/src/bin/pg_basebackup/t/010_pg_basebackup.pl b/src/bin/pg_basebackup/t/010_pg_basebackup.pl index b9f5e1266b424..bf765291e7d5a 100644 --- a/src/bin/pg_basebackup/t/010_pg_basebackup.pl +++ b/src/bin/pg_basebackup/t/010_pg_basebackup.pl @@ -223,10 +223,10 @@ "check backup dir permissions"); } -# Only archive_status directory should be copied in pg_wal/. +# Only archive_status and summaries directories should be copied in pg_wal/. is_deeply( [ sort(slurp_dir("$tempdir/backup/pg_wal/")) ], - [ sort qw(. .. archive_status) ], + [ sort qw(. .. archive_status summaries) ], 'no WAL files copied'); # Contents of these directories should not be copied. diff --git a/src/bin/pg_combinebackup/.gitignore b/src/bin/pg_combinebackup/.gitignore new file mode 100644 index 0000000000000..d7e617438c473 --- /dev/null +++ b/src/bin/pg_combinebackup/.gitignore @@ -0,0 +1 @@ +pg_combinebackup diff --git a/src/bin/pg_combinebackup/Makefile b/src/bin/pg_combinebackup/Makefile new file mode 100644 index 0000000000000..78ba05e624ba1 --- /dev/null +++ b/src/bin/pg_combinebackup/Makefile @@ -0,0 +1,52 @@ +#------------------------------------------------------------------------- +# +# Makefile for src/bin/pg_combinebackup +# +# Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group +# Portions Copyright (c) 1994, Regents of the University of California +# +# src/bin/pg_combinebackup/Makefile +# +#------------------------------------------------------------------------- + +PGFILEDESC = "pg_combinebackup - combine incremental backups" +PGAPPICON=win32 + +subdir = src/bin/pg_combinebackup +top_builddir = ../../.. +include $(top_builddir)/src/Makefile.global + +override CPPFLAGS := -I$(libpq_srcdir) $(CPPFLAGS) +LDFLAGS_INTERNAL += -L$(top_builddir)/src/fe_utils -lpgfeutils + +OBJS = \ + $(WIN32RES) \ + pg_combinebackup.o \ + backup_label.o \ + copy_file.o \ + load_manifest.o \ + reconstruct.o \ + write_manifest.o + +all: pg_combinebackup + +pg_combinebackup: $(OBJS) | submake-libpgport submake-libpgfeutils + $(CC) $(CFLAGS) $^ $(LDFLAGS) $(LDFLAGS_EX) $(LIBS) -o $@$(X) + +install: all installdirs + $(INSTALL_PROGRAM) pg_combinebackup$(X) '$(DESTDIR)$(bindir)/pg_combinebackup$(X)' + +installdirs: + $(MKDIR_P) '$(DESTDIR)$(bindir)' + +uninstall: + rm -f '$(DESTDIR)$(bindir)/pg_combinebackup$(X)' + +clean distclean maintainer-clean: + rm -f pg_combinebackup$(X) $(OBJS) + +check: + $(prove_check) + +installcheck: + $(prove_installcheck) diff --git a/src/bin/pg_combinebackup/backup_label.c b/src/bin/pg_combinebackup/backup_label.c new file mode 100644 index 0000000000000..922e00854d678 --- /dev/null +++ b/src/bin/pg_combinebackup/backup_label.c @@ -0,0 +1,283 @@ +/*------------------------------------------------------------------------- + * + * Read and manipulate backup label files + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/bin/pg_combinebackup/backup_label.c + * + *------------------------------------------------------------------------- + */ +#include "postgres_fe.h" + +#include + +#include "access/xlogdefs.h" +#include "backup_label.h" +#include "common/logging.h" +#include "common/file_perm.h" +#include "write_manifest.h" + +static int get_eol_offset(StringInfo buf); +static bool line_starts_with(char *s, char *e, char *match, char **sout); +static bool parse_lsn(char *s, char *e, XLogRecPtr *lsn, char **c); +static bool parse_tli(char *s, char *e, TimeLineID *tli); + +/* + * Parse a backup label file, starting at buf->cursor. + * + * We expect to find a START WAL LOCATION line, followed by a LSN, followed + * by a space; the resulting LSN is stored into *start_lsn. + * + * We expect to find a START TIMELINE line, followed by a TLI, followed by + * a newline; the resulting TLI is stored into *start_tli. + * + * We expect to find either both INCREMENTAL FROM LSN and INCREMENTAL FROM TLI + * or neither. If these are found, they should be followed by an LSN or TLI + * respectively and then by a newline, and the values will be stored into + * *previous_lsn and *previous_tli, respectively. + * + * Other lines in the provided backup_label data are ignored. filename is used + * for error reporting; errors are fatal. + */ +void +parse_backup_label(char *filename, StringInfo buf, + TimeLineID *start_tli, XLogRecPtr *start_lsn, + TimeLineID *previous_tli, XLogRecPtr *previous_lsn) +{ + int found = 0; + + *start_tli = 0; + *start_lsn = InvalidXLogRecPtr; + *previous_tli = 0; + *previous_lsn = InvalidXLogRecPtr; + + while (buf->cursor < buf->len) + { + char *s = &buf->data[buf->cursor]; + int eo = get_eol_offset(buf); + char *e = &buf->data[eo]; + char *c; + + if (line_starts_with(s, e, "START WAL LOCATION: ", &s)) + { + if (!parse_lsn(s, e, start_lsn, &c)) + pg_fatal("%s: could not parse %s", + filename, "START WAL LOCATION"); + if (c >= e || *c != ' ') + pg_fatal("%s: improper terminator for %s", + filename, "START WAL LOCATION"); + found |= 1; + } + else if (line_starts_with(s, e, "START TIMELINE: ", &s)) + { + if (!parse_tli(s, e, start_tli)) + pg_fatal("%s: could not parse TLI for %s", + filename, "START TIMELINE"); + if (*start_tli == 0) + pg_fatal("%s: invalid TLI", filename); + found |= 2; + } + else if (line_starts_with(s, e, "INCREMENTAL FROM LSN: ", &s)) + { + if (!parse_lsn(s, e, previous_lsn, &c)) + pg_fatal("%s: could not parse %s", + filename, "INCREMENTAL FROM LSN"); + if (c >= e || *c != '\n') + pg_fatal("%s: improper terminator for %s", + filename, "INCREMENTAL FROM LSN"); + found |= 4; + } + else if (line_starts_with(s, e, "INCREMENTAL FROM TLI: ", &s)) + { + if (!parse_tli(s, e, previous_tli)) + pg_fatal("%s: could not parse %s", + filename, "INCREMENTAL FROM TLI"); + if (*previous_tli == 0) + pg_fatal("%s: invalid TLI", filename); + found |= 8; + } + + buf->cursor = eo; + } + + if ((found & 1) == 0) + pg_fatal("%s: could not find %s", filename, "START WAL LOCATION"); + if ((found & 2) == 0) + pg_fatal("%s: could not find %s", filename, "START TIMELINE"); + if ((found & 4) != 0 && (found & 8) == 0) + pg_fatal("%s: %s requires %s", filename, + "INCREMENTAL FROM LSN", "INCREMENTAL FROM TLI"); + if ((found & 8) != 0 && (found & 4) == 0) + pg_fatal("%s: %s requires %s", filename, + "INCREMENTAL FROM TLI", "INCREMENTAL FROM LSN"); +} + +/* + * Write a backup label file to the output directory. + * + * This will be identical to the provided backup_label file, except that the + * INCREMENTAL FROM LSN and INCREMENTAL FROM TLI lines will be omitted. + * + * The new file will be checksummed using the specified algorithm. If + * mwriter != NULL, it will be added to the manifest. + */ +void +write_backup_label(char *output_directory, StringInfo buf, + pg_checksum_type checksum_type, manifest_writer *mwriter) +{ + char output_filename[MAXPGPATH]; + int output_fd; + pg_checksum_context checksum_ctx; + uint8 checksum_payload[PG_CHECKSUM_MAX_LENGTH]; + int checksum_length; + + pg_checksum_init(&checksum_ctx, checksum_type); + + snprintf(output_filename, MAXPGPATH, "%s/backup_label", output_directory); + + if ((output_fd = open(output_filename, + O_WRONLY | O_CREAT | O_EXCL | PG_BINARY, + pg_file_create_mode)) < 0) + pg_fatal("could not open file \"%s\": %m", output_filename); + + while (buf->cursor < buf->len) + { + char *s = &buf->data[buf->cursor]; + int eo = get_eol_offset(buf); + char *e = &buf->data[eo]; + + if (!line_starts_with(s, e, "INCREMENTAL FROM LSN: ", NULL) && + !line_starts_with(s, e, "INCREMENTAL FROM TLI: ", NULL)) + { + ssize_t wb; + + wb = write(output_fd, s, e - s); + if (wb != e - s) + { + if (wb < 0) + pg_fatal("could not write file \"%s\": %m", output_filename); + else + pg_fatal("could not write file \"%s\": wrote only %d of %d bytes", + output_filename, (int) wb, (int) (e - s)); + } + if (pg_checksum_update(&checksum_ctx, (uint8 *) s, e - s) < 0) + pg_fatal("could not update checksum of file \"%s\"", + output_filename); + } + + buf->cursor = eo; + } + + if (close(output_fd) != 0) + pg_fatal("could not close \"%s\": %m", output_filename); + + checksum_length = pg_checksum_final(&checksum_ctx, checksum_payload); + + if (mwriter != NULL) + { + struct stat sb; + + /* + * We could track the length ourselves, but must stat() to get the + * mtime. + */ + if (stat(output_filename, &sb) < 0) + pg_fatal("could not stat file \"%s\": %m", output_filename); + add_file_to_manifest(mwriter, "backup_label", sb.st_size, + sb.st_mtime, checksum_type, + checksum_length, checksum_payload); + } +} + +/* + * Return the offset at which the next line in the buffer starts, or there + * is none, the offset at which the buffer ends. + * + * The search begins at buf->cursor. + */ +static int +get_eol_offset(StringInfo buf) +{ + int eo = buf->cursor; + + while (eo < buf->len) + { + if (buf->data[eo] == '\n') + return eo + 1; + ++eo; + } + + return eo; +} + +/* + * Test whether the line that runs from s to e (inclusive of *s, but not + * inclusive of *e) starts with the match string provided, and return true + * or false according to whether or not this is the case. + * + * If the function returns true and if *sout != NULL, stores a pointer to the + * byte following the match into *sout. + */ +static bool +line_starts_with(char *s, char *e, char *match, char **sout) +{ + while (s < e && *match != '\0' && *s == *match) + ++s, ++match; + + if (*match == '\0' && sout != NULL) + *sout = s; + + return (*match == '\0'); +} + +/* + * Parse an LSN starting at s and not stopping at or before e. The return value + * is true on success and otherwise false. On success, stores the result into + * *lsn and sets *c to the first character that is not part of the LSN. + */ +static bool +parse_lsn(char *s, char *e, XLogRecPtr *lsn, char **c) +{ + char save = *e; + int nchars; + bool success; + unsigned hi; + unsigned lo; + + *e = '\0'; + success = (sscanf(s, "%X/%X%n", &hi, &lo, &nchars) == 2); + *e = save; + + if (success) + { + *lsn = ((XLogRecPtr) hi) << 32 | (XLogRecPtr) lo; + *c = s + nchars; + } + + return success; +} + +/* + * Parse a TLI starting at s and stopping at or before e. The return value is + * true on success and otherwise false. On success, stores the result into + * *tli. If the first character that is not part of the TLI is anything other + * than a newline, that is deemed a failure. + */ +static bool +parse_tli(char *s, char *e, TimeLineID *tli) +{ + char save = *e; + int nchars; + bool success; + + *e = '\0'; + success = (sscanf(s, "%u%n", tli, &nchars) == 1); + *e = save; + + if (success && s[nchars] != '\n') + success = false; + + return success; +} diff --git a/src/bin/pg_combinebackup/backup_label.h b/src/bin/pg_combinebackup/backup_label.h new file mode 100644 index 0000000000000..3af7ea274ccb2 --- /dev/null +++ b/src/bin/pg_combinebackup/backup_label.h @@ -0,0 +1,30 @@ +/*------------------------------------------------------------------------- + * + * Read and manipulate backup label files + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/bin/pg_combinebackup/backup_label.h + * + *------------------------------------------------------------------------- + */ +#ifndef BACKUP_LABEL_H +#define BACKUP_LABEL_H + +#include "access/xlogdefs.h" +#include "common/checksum_helper.h" +#include "lib/stringinfo.h" + +struct manifest_writer; + +extern void parse_backup_label(char *filename, StringInfo buf, + TimeLineID *start_tli, + XLogRecPtr *start_lsn, + TimeLineID *previous_tli, + XLogRecPtr *previous_lsn); +extern void write_backup_label(char *output_directory, StringInfo buf, + pg_checksum_type checksum_type, + struct manifest_writer *mwriter); + +#endif /* BACKUP_LABEL_H */ diff --git a/src/bin/pg_combinebackup/copy_file.c b/src/bin/pg_combinebackup/copy_file.c new file mode 100644 index 0000000000000..40a55e3087846 --- /dev/null +++ b/src/bin/pg_combinebackup/copy_file.c @@ -0,0 +1,169 @@ +/* + * Copy entire files. + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/bin/pg_combinebackup/copy_file.h + * + *------------------------------------------------------------------------- + */ +#include "postgres_fe.h" + +#ifdef HAVE_COPYFILE_H +#include +#endif +#include +#include +#include + +#include "common/file_perm.h" +#include "common/logging.h" +#include "copy_file.h" + +static void copy_file_blocks(const char *src, const char *dst, + pg_checksum_context *checksum_ctx); + +#ifdef WIN32 +static void copy_file_copyfile(const char *src, const char *dst); +#endif + +/* + * Copy a regular file, optionally computing a checksum, and emitting + * appropriate debug messages. But if we're in dry-run mode, then just emit + * the messages and don't copy anything. + */ +void +copy_file(const char *src, const char *dst, + pg_checksum_context *checksum_ctx, bool dry_run) +{ + /* + * In dry-run mode, we don't actually copy anything, nor do we read any + * data from the source file, but we do verify that we can open it. + */ + if (dry_run) + { + int fd; + + if ((fd = open(src, O_RDONLY | PG_BINARY, 0)) < 0) + pg_fatal("could not open \"%s\": %m", src); + if (close(fd) < 0) + pg_fatal("could not close \"%s\": %m", src); + } + + /* + * If we don't need to compute a checksum, then we can use any special + * operating system primitives that we know about to copy the file; this + * may be quicker than a naive block copy. + */ + if (checksum_ctx->type == CHECKSUM_TYPE_NONE) + { + char *strategy_name = NULL; + void (*strategy_implementation) (const char *, const char *) = NULL; + +#ifdef WIN32 + strategy_name = "CopyFile"; + strategy_implementation = copy_file_copyfile; +#endif + + if (strategy_name != NULL) + { + if (dry_run) + pg_log_debug("would copy \"%s\" to \"%s\" using strategy %s", + src, dst, strategy_name); + else + { + pg_log_debug("copying \"%s\" to \"%s\" using strategy %s", + src, dst, strategy_name); + (*strategy_implementation) (src, dst); + } + return; + } + } + + /* + * Fall back to the simple approach of reading and writing all the blocks, + * feeding them into the checksum context as we go. + */ + if (dry_run) + { + if (checksum_ctx->type == CHECKSUM_TYPE_NONE) + pg_log_debug("would copy \"%s\" to \"%s\"", + src, dst); + else + pg_log_debug("would copy \"%s\" to \"%s\" and checksum with %s", + src, dst, pg_checksum_type_name(checksum_ctx->type)); + } + else + { + if (checksum_ctx->type == CHECKSUM_TYPE_NONE) + pg_log_debug("copying \"%s\" to \"%s\"", + src, dst); + else + pg_log_debug("copying \"%s\" to \"%s\" and checksumming with %s", + src, dst, pg_checksum_type_name(checksum_ctx->type)); + copy_file_blocks(src, dst, checksum_ctx); + } +} + +/* + * Copy a file block by block, and optionally compute a checksum as we go. + */ +static void +copy_file_blocks(const char *src, const char *dst, + pg_checksum_context *checksum_ctx) +{ + int src_fd; + int dest_fd; + uint8 *buffer; + const int buffer_size = 50 * BLCKSZ; + ssize_t rb; + unsigned offset = 0; + + if ((src_fd = open(src, O_RDONLY | PG_BINARY, 0)) < 0) + pg_fatal("could not open file \"%s\": %m", src); + + if ((dest_fd = open(dst, O_WRONLY | O_CREAT | O_EXCL | PG_BINARY, + pg_file_create_mode)) < 0) + pg_fatal("could not open file \"%s\": %m", dst); + + buffer = pg_malloc(buffer_size); + + while ((rb = read(src_fd, buffer, buffer_size)) > 0) + { + ssize_t wb; + + if ((wb = write(dest_fd, buffer, rb)) != rb) + { + if (wb < 0) + pg_fatal("could not write file \"%s\": %m", dst); + else + pg_fatal("could not write file \"%s\": wrote only %d of %d bytes at offset %u", + dst, (int) wb, (int) rb, offset); + } + + if (pg_checksum_update(checksum_ctx, buffer, rb) < 0) + pg_fatal("could not update checksum of file \"%s\"", dst); + + offset += rb; + } + + if (rb < 0) + pg_fatal("could not read file \"%s\": %m", dst); + + pg_free(buffer); + close(src_fd); + close(dest_fd); +} + +#ifdef WIN32 +static void +copy_file_copyfile(const char *src, const char *dst) +{ + if (CopyFile(src, dst, true) == 0) + { + _dosmaperr(GetLastError()); + pg_fatal("could not copy \"%s\" to \"%s\": %m", src, dst); + } +} +#endif /* WIN32 */ diff --git a/src/bin/pg_combinebackup/copy_file.h b/src/bin/pg_combinebackup/copy_file.h new file mode 100644 index 0000000000000..031030bacbfba --- /dev/null +++ b/src/bin/pg_combinebackup/copy_file.h @@ -0,0 +1,19 @@ +/* + * Copy entire files. + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/bin/pg_combinebackup/copy_file.h + * + *------------------------------------------------------------------------- + */ +#ifndef COPY_FILE_H +#define COPY_FILE_H + +#include "common/checksum_helper.h" + +extern void copy_file(const char *src, const char *dst, + pg_checksum_context *checksum_ctx, bool dry_run); + +#endif /* COPY_FILE_H */ diff --git a/src/bin/pg_combinebackup/load_manifest.c b/src/bin/pg_combinebackup/load_manifest.c new file mode 100644 index 0000000000000..ad32323c9c2b1 --- /dev/null +++ b/src/bin/pg_combinebackup/load_manifest.c @@ -0,0 +1,245 @@ +/*------------------------------------------------------------------------- + * + * Load data from a backup manifest into memory. + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/bin/pg_combinebackup/load_manifest.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres_fe.h" + +#include +#include + +#include "common/hashfn.h" +#include "common/logging.h" +#include "common/parse_manifest.h" +#include "load_manifest.h" + +/* + * For efficiency, we'd like our hash table containing information about the + * manifest to start out with approximately the correct number of entries. + * There's no way to know the exact number of entries without reading the whole + * file, but we can get an estimate by dividing the file size by the estimated + * number of bytes per line. + * + * This could be off by about a factor of two in either direction, because the + * checksum algorithm has a big impact on the line lengths; e.g. a SHA512 + * checksum is 128 hex bytes, whereas a CRC-32C value is only 8, and there + * might be no checksum at all. + */ +#define ESTIMATED_BYTES_PER_MANIFEST_LINE 100 + +/* + * Define a hash table which we can use to store information about the files + * mentioned in the backup manifest. + */ +static uint32 hash_string_pointer(char *s); +#define SH_PREFIX manifest_files +#define SH_ELEMENT_TYPE manifest_file +#define SH_KEY_TYPE char * +#define SH_KEY pathname +#define SH_HASH_KEY(tb, key) hash_string_pointer(key) +#define SH_EQUAL(tb, a, b) (strcmp(a, b) == 0) +#define SH_SCOPE extern +#define SH_RAW_ALLOCATOR pg_malloc0 +#define SH_DEFINE +#include "lib/simplehash.h" + +static void combinebackup_per_file_cb(JsonManifestParseContext *context, + char *pathname, size_t size, + pg_checksum_type checksum_type, + int checksum_length, + uint8 *checksum_payload); +static void combinebackup_per_wal_range_cb(JsonManifestParseContext *context, + TimeLineID tli, + XLogRecPtr start_lsn, + XLogRecPtr end_lsn); +static void report_manifest_error(JsonManifestParseContext *context, + const char *fmt,...) + pg_attribute_printf(2, 3) pg_attribute_noreturn(); + +/* + * Load backup_manifest files from an array of backups and produces an array + * of manifest_data objects. + * + * NB: Since load_backup_manifest() can return NULL, the resulting array could + * contain NULL entries. + */ +manifest_data ** +load_backup_manifests(int n_backups, char **backup_directories) +{ + manifest_data **result; + int i; + + result = pg_malloc(sizeof(manifest_data *) * n_backups); + for (i = 0; i < n_backups; ++i) + result[i] = load_backup_manifest(backup_directories[i]); + + return result; +} + +/* + * Parse the backup_manifest file in the named backup directory. Construct a + * hash table with information about all the files it mentions, and a linked + * list of all the WAL ranges it mentions. + * + * If the backup_manifest file simply doesn't exist, logs a warning and returns + * NULL. Any other error, or any error parsing the contents of the file, is + * fatal. + */ +manifest_data * +load_backup_manifest(char *backup_directory) +{ + char pathname[MAXPGPATH]; + int fd; + struct stat statbuf; + off_t estimate; + uint32 initial_size; + manifest_files_hash *ht; + char *buffer; + int rc; + JsonManifestParseContext context; + manifest_data *result; + + /* Open the manifest file. */ + snprintf(pathname, MAXPGPATH, "%s/backup_manifest", backup_directory); + if ((fd = open(pathname, O_RDONLY | PG_BINARY, 0)) < 0) + { + if (errno == ENOENT) + { + pg_log_warning("\"%s\" does not exist", pathname); + return NULL; + } + pg_fatal("could not open file \"%s\": %m", pathname); + } + + /* Figure out how big the manifest is. */ + if (fstat(fd, &statbuf) != 0) + pg_fatal("could not stat file \"%s\": %m", pathname); + + /* Guess how large to make the hash table based on the manifest size. */ + estimate = statbuf.st_size / ESTIMATED_BYTES_PER_MANIFEST_LINE; + initial_size = Min(PG_UINT32_MAX, Max(estimate, 256)); + + /* Create the hash table. */ + ht = manifest_files_create(initial_size, NULL); + + /* + * Slurp in the whole file. + * + * This is not ideal, but there's currently no way to get pg_parse_json() + * to perform incremental parsing. + */ + buffer = pg_malloc(statbuf.st_size); + rc = read(fd, buffer, statbuf.st_size); + if (rc != statbuf.st_size) + { + if (rc < 0) + pg_fatal("could not read file \"%s\": %m", pathname); + else + pg_fatal("could not read file \"%s\": read %d of %lld", + pathname, rc, (long long int) statbuf.st_size); + } + + /* Close the manifest file. */ + close(fd); + + /* Parse the manifest. */ + result = pg_malloc0(sizeof(manifest_data)); + result->files = ht; + context.private_data = result; + context.per_file_cb = combinebackup_per_file_cb; + context.per_wal_range_cb = combinebackup_per_wal_range_cb; + context.error_cb = report_manifest_error; + json_parse_manifest(&context, buffer, statbuf.st_size); + + /* All done. */ + pfree(buffer); + return result; +} + +/* + * Report an error while parsing the manifest. + * + * We consider all such errors to be fatal errors. The manifest parser + * expects this function not to return. + */ +static void +report_manifest_error(JsonManifestParseContext *context, const char *fmt,...) +{ + va_list ap; + + va_start(ap, fmt); + pg_log_generic_v(PG_LOG_ERROR, PG_LOG_PRIMARY, gettext(fmt), ap); + va_end(ap); + + exit(1); +} + +/* + * Record details extracted from the backup manifest for one file. + */ +static void +combinebackup_per_file_cb(JsonManifestParseContext *context, + char *pathname, size_t size, + pg_checksum_type checksum_type, + int checksum_length, uint8 *checksum_payload) +{ + manifest_data *manifest = context->private_data; + manifest_file *m; + bool found; + + /* Make a new entry in the hash table for this file. */ + m = manifest_files_insert(manifest->files, pathname, &found); + if (found) + pg_fatal("duplicate path name in backup manifest: \"%s\"", pathname); + + /* Initialize the entry. */ + m->size = size; + m->checksum_type = checksum_type; + m->checksum_length = checksum_length; + m->checksum_payload = checksum_payload; +} + +/* + * Record details extracted from the backup manifest for one WAL range. + */ +static void +combinebackup_per_wal_range_cb(JsonManifestParseContext *context, + TimeLineID tli, + XLogRecPtr start_lsn, XLogRecPtr end_lsn) +{ + manifest_data *manifest = context->private_data; + manifest_wal_range *range; + + /* Allocate and initialize a struct describing this WAL range. */ + range = palloc(sizeof(manifest_wal_range)); + range->tli = tli; + range->start_lsn = start_lsn; + range->end_lsn = end_lsn; + range->prev = manifest->last_wal_range; + range->next = NULL; + + /* Add it to the end of the list. */ + if (manifest->first_wal_range == NULL) + manifest->first_wal_range = range; + else + manifest->last_wal_range->next = range; + manifest->last_wal_range = range; +} + +/* + * Helper function for manifest_files hash table. + */ +static uint32 +hash_string_pointer(char *s) +{ + unsigned char *ss = (unsigned char *) s; + + return hash_bytes(ss, strlen(s)); +} diff --git a/src/bin/pg_combinebackup/load_manifest.h b/src/bin/pg_combinebackup/load_manifest.h new file mode 100644 index 0000000000000..2bfeeff156042 --- /dev/null +++ b/src/bin/pg_combinebackup/load_manifest.h @@ -0,0 +1,67 @@ +/*------------------------------------------------------------------------- + * + * Load data from a backup manifest into memory. + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/bin/pg_combinebackup/load_manifest.h + * + *------------------------------------------------------------------------- + */ +#ifndef LOAD_MANIFEST_H +#define LOAD_MANIFEST_H + +#include "access/xlogdefs.h" +#include "common/checksum_helper.h" + +/* + * Each file described by the manifest file is parsed to produce an object + * like this. + */ +typedef struct manifest_file +{ + uint32 status; /* hash status */ + char *pathname; + size_t size; + pg_checksum_type checksum_type; + int checksum_length; + uint8 *checksum_payload; +} manifest_file; + +#define SH_PREFIX manifest_files +#define SH_ELEMENT_TYPE manifest_file +#define SH_KEY_TYPE char * +#define SH_SCOPE extern +#define SH_RAW_ALLOCATOR pg_malloc0 +#define SH_DECLARE +#include "lib/simplehash.h" + +/* + * Each WAL range described by the manifest file is parsed to produce an + * object like this. + */ +typedef struct manifest_wal_range +{ + TimeLineID tli; + XLogRecPtr start_lsn; + XLogRecPtr end_lsn; + struct manifest_wal_range *next; + struct manifest_wal_range *prev; +} manifest_wal_range; + +/* + * All the data parsed from a backup_manifest file. + */ +typedef struct manifest_data +{ + manifest_files_hash *files; + manifest_wal_range *first_wal_range; + manifest_wal_range *last_wal_range; +} manifest_data; + +extern manifest_data *load_backup_manifest(char *backup_directory); +extern manifest_data **load_backup_manifests(int n_backups, + char **backup_directories); + +#endif /* LOAD_MANIFEST_H */ diff --git a/src/bin/pg_combinebackup/meson.build b/src/bin/pg_combinebackup/meson.build new file mode 100644 index 0000000000000..e402d6f50e7d4 --- /dev/null +++ b/src/bin/pg_combinebackup/meson.build @@ -0,0 +1,38 @@ +# Copyright (c) 2022-2023, PostgreSQL Global Development Group + +pg_combinebackup_sources = files( + 'pg_combinebackup.c', + 'backup_label.c', + 'copy_file.c', + 'load_manifest.c', + 'reconstruct.c', + 'write_manifest.c', +) + +if host_system == 'windows' + pg_combinebackup_sources += rc_bin_gen.process(win32ver_rc, extra_args: [ + '--NAME', 'pg_combinebackup', + '--FILEDESC', 'pg_combinebackup - combine incremental backups',]) +endif + +pg_combinebackup = executable('pg_combinebackup', + pg_combinebackup_sources, + dependencies: [frontend_code], + kwargs: default_bin_args, +) +bin_targets += pg_combinebackup + +tests += { + 'name': 'pg_combinebackup', + 'sd': meson.current_source_dir(), + 'bd': meson.current_build_dir(), + 'tap': { + 'tests': [ + 't/001_basic.pl', + 't/002_compare_backups.pl', + 't/003_timeline.pl', + 't/004_manifest.pl', + 't/005_integrity.pl', + ], + } +} diff --git a/src/bin/pg_combinebackup/nls.mk b/src/bin/pg_combinebackup/nls.mk new file mode 100644 index 0000000000000..c8e59d1d00d4e --- /dev/null +++ b/src/bin/pg_combinebackup/nls.mk @@ -0,0 +1,11 @@ +# src/bin/pg_combinebackup/nls.mk +CATALOG_NAME = pg_combinebackup +GETTEXT_FILES = $(FRONTEND_COMMON_GETTEXT_FILES) \ + backup_label.c \ + copy_file.c \ + load_manifest.c \ + pg_combinebackup.c \ + reconstruct.c \ + write_manifest.c +GETTEXT_TRIGGERS = $(FRONTEND_COMMON_GETTEXT_TRIGGERS) +GETTEXT_FLAGS = $(FRONTEND_COMMON_GETTEXT_FLAGS) diff --git a/src/bin/pg_combinebackup/pg_combinebackup.c b/src/bin/pg_combinebackup/pg_combinebackup.c new file mode 100644 index 0000000000000..85d3f4e5def5b --- /dev/null +++ b/src/bin/pg_combinebackup/pg_combinebackup.c @@ -0,0 +1,1284 @@ +/*------------------------------------------------------------------------- + * + * pg_combinebackup.c + * Combine incremental backups with prior backups. + * + * Copyright (c) 2017-2023, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/bin/pg_combinebackup/pg_combinebackup.c + * + *------------------------------------------------------------------------- + */ +#include "postgres_fe.h" + +#include +#include +#include + +#include "backup_label.h" +#include "common/blkreftable.h" +#include "common/checksum_helper.h" +#include "common/controldata_utils.h" +#include "common/file_perm.h" +#include "common/file_utils.h" +#include "common/logging.h" +#include "copy_file.h" +#include "fe_utils/option_utils.h" +#include "lib/stringinfo.h" +#include "load_manifest.h" +#include "getopt_long.h" +#include "reconstruct.h" +#include "write_manifest.h" + +/* Incremental file naming convention. */ +#define INCREMENTAL_PREFIX "INCREMENTAL." +#define INCREMENTAL_PREFIX_LENGTH (sizeof(INCREMENTAL_PREFIX) - 1) + +/* + * Tracking for directories that need to be removed, or have their contents + * removed, if the operation fails. + */ +typedef struct cb_cleanup_dir +{ + char *target_path; + bool rmtopdir; + struct cb_cleanup_dir *next; +} cb_cleanup_dir; + +/* + * Stores a tablespace mapping provided using -T, --tablespace-mapping. + */ +typedef struct cb_tablespace_mapping +{ + char old_dir[MAXPGPATH]; + char new_dir[MAXPGPATH]; + struct cb_tablespace_mapping *next; +} cb_tablespace_mapping; + +/* + * Stores data parsed from all command-line options. + */ +typedef struct cb_options +{ + bool debug; + char *output; + bool dry_run; + bool no_sync; + cb_tablespace_mapping *tsmappings; + pg_checksum_type manifest_checksums; + bool no_manifest; + DataDirSyncMethod sync_method; +} cb_options; + +/* + * Data about a tablespace. + * + * Every normal tablespace needs a tablespace mapping, but in-place tablespaces + * don't, so the list of tablespaces can contain more entries than the list of + * tablespace mappings. + */ +typedef struct cb_tablespace +{ + Oid oid; + bool in_place; + char old_dir[MAXPGPATH]; + char new_dir[MAXPGPATH]; + struct cb_tablespace *next; +} cb_tablespace; + +/* Directories to be removed if we exit uncleanly. */ +cb_cleanup_dir *cleanup_dir_list = NULL; + +static void add_tablespace_mapping(cb_options *opt, char *arg); +static StringInfo check_backup_label_files(int n_backups, char **backup_dirs); +static void check_control_files(int n_backups, char **backup_dirs); +static void check_input_dir_permissions(char *dir); +static void cleanup_directories_atexit(void); +static void create_output_directory(char *dirname, cb_options *opt); +static void help(const char *progname); +static bool parse_oid(char *s, Oid *result); +static void process_directory_recursively(Oid tsoid, + char *input_directory, + char *output_directory, + char *relative_path, + int n_prior_backups, + char **prior_backup_dirs, + manifest_data **manifests, + manifest_writer *mwriter, + cb_options *opt); +static int read_pg_version_file(char *directory); +static void remember_to_cleanup_directory(char *target_path, bool rmtopdir); +static void reset_directory_cleanup_list(void); +static cb_tablespace *scan_for_existing_tablespaces(char *pathname, + cb_options *opt); +static void slurp_file(int fd, char *filename, StringInfo buf, int maxlen); + +/* + * Main program. + */ +int +main(int argc, char *argv[]) +{ + static struct option long_options[] = { + {"debug", no_argument, NULL, 'd'}, + {"dry-run", no_argument, NULL, 'n'}, + {"no-sync", no_argument, NULL, 'N'}, + {"output", required_argument, NULL, 'o'}, + {"tablespace-mapping", no_argument, NULL, 'T'}, + {"manifest-checksums", required_argument, NULL, 1}, + {"no-manifest", no_argument, NULL, 2}, + {"sync-method", required_argument, NULL, 3}, + {NULL, 0, NULL, 0} + }; + + const char *progname; + char *last_input_dir; + int optindex; + int c; + int n_backups; + int n_prior_backups; + int version; + char **prior_backup_dirs; + cb_options opt; + cb_tablespace *tablespaces; + cb_tablespace *ts; + StringInfo last_backup_label; + manifest_data **manifests; + manifest_writer *mwriter; + + pg_logging_init(argv[0]); + progname = get_progname(argv[0]); + handle_help_version_opts(argc, argv, progname, help); + + memset(&opt, 0, sizeof(opt)); + opt.manifest_checksums = CHECKSUM_TYPE_CRC32C; + opt.sync_method = DATA_DIR_SYNC_METHOD_FSYNC; + + /* process command-line options */ + while ((c = getopt_long(argc, argv, "dnNPo:T:", + long_options, &optindex)) != -1) + { + switch (c) + { + case 'd': + opt.debug = true; + pg_logging_increase_verbosity(); + break; + case 'n': + opt.dry_run = true; + break; + case 'N': + opt.no_sync = true; + break; + case 'o': + opt.output = optarg; + break; + case 'T': + add_tablespace_mapping(&opt, optarg); + break; + case 1: + if (!pg_checksum_parse_type(optarg, + &opt.manifest_checksums)) + pg_fatal("unrecognized checksum algorithm: \"%s\"", + optarg); + break; + case 2: + opt.no_manifest = true; + break; + case 3: + if (!parse_sync_method(optarg, &opt.sync_method)) + exit(1); + break; + default: + /* getopt_long already emitted a complaint */ + pg_log_error_hint("Try \"%s --help\" for more information.", progname); + exit(1); + } + } + + if (optind >= argc) + { + pg_log_error("%s: no input directories specified", progname); + pg_log_error_hint("Try \"%s --help\" for more information.", progname); + exit(1); + } + + if (opt.output == NULL) + pg_fatal("no output directory specified"); + + /* If no manifest is needed, no checksums are needed, either. */ + if (opt.no_manifest) + opt.manifest_checksums = CHECKSUM_TYPE_NONE; + + /* Read the server version from the final backup. */ + version = read_pg_version_file(argv[argc - 1]); + + /* Sanity-check control files. */ + n_backups = argc - optind; + check_control_files(n_backups, argv + optind); + + /* Sanity-check backup_label files, and get the contents of the last one. */ + last_backup_label = check_backup_label_files(n_backups, argv + optind); + + /* + * We'll need the pathnames to the prior backups. By "prior" we mean all + * but the last one listed on the command line. + */ + n_prior_backups = argc - optind - 1; + prior_backup_dirs = argv + optind; + + /* Load backup manifests. */ + manifests = load_backup_manifests(n_backups, prior_backup_dirs); + + /* Figure out which tablespaces are going to be included in the output. */ + last_input_dir = argv[argc - 1]; + check_input_dir_permissions(last_input_dir); + tablespaces = scan_for_existing_tablespaces(last_input_dir, &opt); + + /* + * Create output directories. + * + * We create one output directory for the main data directory plus one for + * each non-in-place tablespace. create_output_directory() will arrange + * for those directories to be cleaned up on failure. In-place tablespaces + * aren't handled at this stage because they're located beneath the main + * output directory, and thus the cleanup of that directory will get rid + * of them. Plus, the pg_tblspc directory that needs to contain them + * doesn't exist yet. + */ + atexit(cleanup_directories_atexit); + create_output_directory(opt.output, &opt); + for (ts = tablespaces; ts != NULL; ts = ts->next) + if (!ts->in_place) + create_output_directory(ts->new_dir, &opt); + + /* If we need to write a backup_manifest, prepare to do so. */ + if (!opt.dry_run && !opt.no_manifest) + { + mwriter = create_manifest_writer(opt.output); + + /* + * Verify that we have a backup manifest for the final backup; else we + * won't have the WAL ranges for the resulting manifest. + */ + if (manifests[n_prior_backups] == NULL) + pg_fatal("can't generate a manifest because no manifest is available for the final input backup"); + } + else + mwriter = NULL; + + /* Write backup label into output directory. */ + if (opt.dry_run) + pg_log_debug("would generate \"%s/backup_label\"", opt.output); + else + { + pg_log_debug("generating \"%s/backup_label\"", opt.output); + last_backup_label->cursor = 0; + write_backup_label(opt.output, last_backup_label, + opt.manifest_checksums, mwriter); + } + + /* Process everything that's not part of a user-defined tablespace. */ + pg_log_debug("processing backup directory \"%s\"", last_input_dir); + process_directory_recursively(InvalidOid, last_input_dir, opt.output, + NULL, n_prior_backups, prior_backup_dirs, + manifests, mwriter, &opt); + + /* Process user-defined tablespaces. */ + for (ts = tablespaces; ts != NULL; ts = ts->next) + { + pg_log_debug("processing tablespace directory \"%s\"", ts->old_dir); + + /* + * If it's a normal tablespace, we need to set up a symbolic link from + * pg_tblspc/${OID} to the target directory; if it's an in-place + * tablespace, we need to create a directory at pg_tblspc/${OID}. + */ + if (!ts->in_place) + { + char linkpath[MAXPGPATH]; + + snprintf(linkpath, MAXPGPATH, "%s/pg_tblspc/%u", opt.output, + ts->oid); + + if (opt.dry_run) + pg_log_debug("would create symbolic link from \"%s\" to \"%s\"", + linkpath, ts->new_dir); + else + { + pg_log_debug("creating symbolic link from \"%s\" to \"%s\"", + linkpath, ts->new_dir); + if (symlink(ts->new_dir, linkpath) != 0) + pg_fatal("could not create symbolic link from \"%s\" to \"%s\": %m", + linkpath, ts->new_dir); + } + } + else + { + if (opt.dry_run) + pg_log_debug("would create directory \"%s\"", ts->new_dir); + else + { + pg_log_debug("creating directory \"%s\"", ts->new_dir); + if (pg_mkdir_p(ts->new_dir, pg_dir_create_mode) == -1) + pg_fatal("could not create directory \"%s\": %m", + ts->new_dir); + } + } + + /* OK, now handle the directory contents. */ + process_directory_recursively(ts->oid, ts->old_dir, ts->new_dir, + NULL, n_prior_backups, prior_backup_dirs, + manifests, mwriter, &opt); + } + + /* Finalize the backup_manifest, if we're generating one. */ + if (mwriter != NULL) + finalize_manifest(mwriter, + manifests[n_prior_backups]->first_wal_range); + + /* fsync that output directory unless we've been told not to do so */ + if (!opt.no_sync) + { + if (opt.dry_run) + pg_log_debug("would recursively fsync \"%s\"", opt.output); + else + { + pg_log_debug("recursively fsyncing \"%s\"", opt.output); + sync_pgdata(opt.output, version * 10000, opt.sync_method); + } + } + + /* It's a success, so don't remove the output directories. */ + reset_directory_cleanup_list(); + exit(0); +} + +/* + * Process the option argument for the -T, --tablespace-mapping switch. + */ +static void +add_tablespace_mapping(cb_options *opt, char *arg) +{ + cb_tablespace_mapping *tsmap = pg_malloc0(sizeof(cb_tablespace_mapping)); + char *dst; + char *dst_ptr; + char *arg_ptr; + + /* + * Basically, we just want to copy everything before the equals sign to + * tsmap->old_dir and everything afterwards to tsmap->new_dir, but if + * there's more or less than one equals sign, that's an error, and if + * there's an equals sign preceded by a backslash, don't treat it as a + * field separator but instead copy a literal equals sign. + */ + dst_ptr = dst = tsmap->old_dir; + for (arg_ptr = arg; *arg_ptr != '\0'; arg_ptr++) + { + if (dst_ptr - dst >= MAXPGPATH) + pg_fatal("directory name too long"); + + if (*arg_ptr == '\\' && *(arg_ptr + 1) == '=') + ; /* skip backslash escaping = */ + else if (*arg_ptr == '=' && (arg_ptr == arg || *(arg_ptr - 1) != '\\')) + { + if (tsmap->new_dir[0] != '\0') + pg_fatal("multiple \"=\" signs in tablespace mapping"); + else + dst = dst_ptr = tsmap->new_dir; + } + else + *dst_ptr++ = *arg_ptr; + } + if (!tsmap->old_dir[0] || !tsmap->new_dir[0]) + pg_fatal("invalid tablespace mapping format \"%s\", must be \"OLDDIR=NEWDIR\"", arg); + + /* + * All tablespaces are created with absolute directories, so specifying a + * non-absolute path here would never match, possibly confusing users. + * + * In contrast to pg_basebackup, both the old and new directories are on + * the local machine, so the local machine's definition of an absolute + * path is the only relevant one. + */ + if (!is_absolute_path(tsmap->old_dir)) + pg_fatal("old directory is not an absolute path in tablespace mapping: %s", + tsmap->old_dir); + + if (!is_absolute_path(tsmap->new_dir)) + pg_fatal("old directory is not an absolute path in tablespace mapping: %s", + tsmap->new_dir); + + /* Canonicalize paths to avoid spurious failures when comparing. */ + canonicalize_path(tsmap->old_dir); + canonicalize_path(tsmap->new_dir); + + /* Add it to the list. */ + tsmap->next = opt->tsmappings; + opt->tsmappings = tsmap; +} + +/* + * Check that the backup_label files form a coherent backup chain, and return + * the contents of the backup_label file from the latest backup. + */ +static StringInfo +check_backup_label_files(int n_backups, char **backup_dirs) +{ + StringInfo buf = makeStringInfo(); + StringInfo lastbuf = buf; + int i; + TimeLineID check_tli = 0; + XLogRecPtr check_lsn = InvalidXLogRecPtr; + + /* Try to read each backup_label file in turn, last to first. */ + for (i = n_backups - 1; i >= 0; --i) + { + char pathbuf[MAXPGPATH]; + int fd; + TimeLineID start_tli; + TimeLineID previous_tli; + XLogRecPtr start_lsn; + XLogRecPtr previous_lsn; + + /* Open the backup_label file. */ + snprintf(pathbuf, MAXPGPATH, "%s/backup_label", backup_dirs[i]); + pg_log_debug("reading \"%s\"", pathbuf); + if ((fd = open(pathbuf, O_RDONLY, 0)) < 0) + pg_fatal("could not open file \"%s\": %m", pathbuf); + + /* + * Slurp the whole file into memory. + * + * The exact size limit that we impose here doesn't really matter -- + * most of what's supposed to be in the file is fixed size and quite + * short. However, the length of the backup_label is limited (at least + * by some parts of the code) to MAXGPATH, so include that value in + * the maximum length that we tolerate. + */ + slurp_file(fd, pathbuf, buf, 10000 + MAXPGPATH); + + /* Close the file. */ + if (close(fd) != 0) + pg_fatal("could not close \"%s\": %m", pathbuf); + + /* Parse the file contents. */ + parse_backup_label(pathbuf, buf, &start_tli, &start_lsn, + &previous_tli, &previous_lsn); + + /* + * Sanity checks. + * + * XXX. It's actually not required that start_lsn == check_lsn. It + * would be OK if start_lsn > check_lsn provided that start_lsn is + * less than or equal to the relevant switchpoint. But at the moment + * we don't have that information. + */ + if (i > 0 && previous_tli == 0) + pg_fatal("backup at \"%s\" is a full backup, but only the first backup should be a full backup", + backup_dirs[i]); + if (i == 0 && previous_tli != 0) + pg_fatal("backup at \"%s\" is an incremental backup, but the first backup should be a full backup", + backup_dirs[i]); + if (i < n_backups - 1 && start_tli != check_tli) + pg_fatal("backup at \"%s\" starts on timeline %u, but expected %u", + backup_dirs[i], start_tli, check_tli); + if (i < n_backups - 1 && start_lsn != check_lsn) + pg_fatal("backup at \"%s\" starts at LSN %X/%X, but expected %X/%X", + backup_dirs[i], + LSN_FORMAT_ARGS(start_lsn), + LSN_FORMAT_ARGS(check_lsn)); + check_tli = previous_tli; + check_lsn = previous_lsn; + + /* + * The last backup label in the chain needs to be saved for later use, + * while the others are only needed within this loop. + */ + if (lastbuf == buf) + buf = makeStringInfo(); + else + resetStringInfo(buf); + } + + /* Free memory that we don't need any more. */ + if (lastbuf != buf) + { + pfree(buf->data); + pfree(buf); + } + + /* + * Return the data from the first backup_info that we read (which is the + * backup_label from the last directory specified on the command line). + */ + return lastbuf; +} + +/* + * Sanity check control files. + */ +static void +check_control_files(int n_backups, char **backup_dirs) +{ + int i; + uint64 system_identifier = 0; /* placate compiler */ + + /* Try to read each control file in turn, last to first. */ + for (i = n_backups - 1; i >= 0; --i) + { + ControlFileData *control_file; + bool crc_ok; + char *controlpath; + + controlpath = psprintf("%s/%s", backup_dirs[i], "global/pg_control"); + pg_log_debug("reading \"%s\"", controlpath); + control_file = get_controlfile(backup_dirs[i], &crc_ok); + + /* Control file contents not meaningful if CRC is bad. */ + if (!crc_ok) + pg_fatal("%s: crc is incorrect", controlpath); + + /* Can't interpret control file if not current version. */ + if (control_file->pg_control_version != PG_CONTROL_VERSION) + pg_fatal("%s: unexpected control file version", + controlpath); + + /* System identifiers should all match. */ + if (i == n_backups - 1) + system_identifier = control_file->system_identifier; + else if (system_identifier != control_file->system_identifier) + pg_fatal("%s: expected system identifier %llu, but found %llu", + controlpath, (unsigned long long) system_identifier, + (unsigned long long) control_file->system_identifier); + + /* Release memory. */ + pfree(control_file); + pfree(controlpath); + } + + /* + * If debug output is enabled, make a note of the system identifier that + * we found in all of the relevant control files. + */ + pg_log_debug("system identifier is %llu", + (unsigned long long) system_identifier); +} + +/* + * Set default permissions for new files and directories based on the + * permissions of the given directory. The intent here is that the output + * directory should use the same permissions scheme as the final input + * directory. + */ +static void +check_input_dir_permissions(char *dir) +{ + struct stat st; + + if (stat(dir, &st) != 0) + pg_fatal("could not stat \"%s\": %m", dir); + + SetDataDirectoryCreatePerm(st.st_mode); +} + +/* + * Clean up output directories before exiting. + */ +static void +cleanup_directories_atexit(void) +{ + while (cleanup_dir_list != NULL) + { + cb_cleanup_dir *dir = cleanup_dir_list; + + if (dir->rmtopdir) + { + pg_log_info("removing output directory \"%s\"", dir->target_path); + if (!rmtree(dir->target_path, dir->rmtopdir)) + pg_log_error("failed to remove output directory"); + } + else + { + pg_log_info("removing contents of output directory \"%s\"", + dir->target_path); + if (!rmtree(dir->target_path, dir->rmtopdir)) + pg_log_error("failed to remove contents of output directory"); + } + + cleanup_dir_list = cleanup_dir_list->next; + pfree(dir); + } +} + +/* + * Create the named output directory, unless it already exists or we're in + * dry-run mode. If it already exists but is not empty, that's a fatal error. + * + * Adds the created directory to the list of directories to be cleaned up + * at process exit. + */ +static void +create_output_directory(char *dirname, cb_options *opt) +{ + switch (pg_check_dir(dirname)) + { + case 0: + if (opt->dry_run) + { + pg_log_debug("would create directory \"%s\"", dirname); + return; + } + pg_log_debug("creating directory \"%s\"", dirname); + if (pg_mkdir_p(dirname, pg_dir_create_mode) == -1) + pg_fatal("could not create directory \"%s\": %m", dirname); + remember_to_cleanup_directory(dirname, true); + break; + + case 1: + pg_log_debug("using existing directory \"%s\"", dirname); + remember_to_cleanup_directory(dirname, false); + break; + + case 2: + case 3: + case 4: + pg_fatal("directory \"%s\" exists but is not empty", dirname); + + case -1: + pg_fatal("could not access directory \"%s\": %m", dirname); + } +} + +/* + * help + * + * Prints help page for the program + * + * progname: the name of the executed program, such as "pg_combinebackup" + */ +static void +help(const char *progname) +{ + printf(_("%s reconstructs full backups from incrementals.\n\n"), progname); + printf(_("Usage:\n")); + printf(_(" %s [OPTION]... DIRECTORY...\n"), progname); + printf(_("\nOptions:\n")); + printf(_(" -d, --debug generate lots of debugging output\n")); + printf(_(" -n, --dry-run don't actually do anything\n")); + printf(_(" -N, --no-sync do not wait for changes to be written safely to disk\n")); + printf(_(" -o, --output output directory\n")); + printf(_(" -T, --tablespace-mapping=OLDDIR=NEWDIR\n")); + printf(_(" relocate tablespace in OLDDIR to NEWDIR\n")); + printf(_(" --manifest-checksums=SHA{224,256,384,512}|CRC32C|NONE\n" + " use algorithm for manifest checksums\n")); + printf(_(" --no-manifest suppress generation of backup manifest\n")); + printf(_(" --sync-method=METHOD set method for syncing files to disk\n")); + printf(_(" -?, --help show this help, then exit\n")); + + printf(_("\nReport bugs to <%s>.\n"), PACKAGE_BUGREPORT); + printf(_("%s home page: <%s>\n"), PACKAGE_NAME, PACKAGE_URL); +} + +/* + * Try to parse a string as a non-zero OID without leading zeroes. + * + * If it works, return true and set *result to the answer, else return false. + */ +static bool +parse_oid(char *s, Oid *result) +{ + Oid oid; + char *ep; + + errno = 0; + oid = strtoul(s, &ep, 10); + if (errno != 0 || *ep != '\0' || oid < 1 || oid > PG_UINT32_MAX) + return false; + + *result = oid; + return true; +} + +/* + * Copy files from the input directory to the output directory, reconstructing + * full files from incremental files as required. + * + * If processing is a user-defined tablespace, the tsoid should be the OID + * of that tablespace and input_directory and output_directory should be the + * toplevel input and output directories for that tablespace. Otherwise, + * tsoid should be InvalidOid and input_directory and output_directory should + * be the main input and output directories. + * + * relative_path is the path beneath the given input and output directories + * that we are currently processing. If NULL, it indicates that we're + * processing the input and output directories themselves. + * + * n_prior_backups is the number of prior backups that we have available. + * This doesn't count the very last backup, which is referenced by + * output_directory, just the older ones. prior_backup_dirs is an array of + * the locations of those previous backups. + */ +static void +process_directory_recursively(Oid tsoid, + char *input_directory, + char *output_directory, + char *relative_path, + int n_prior_backups, + char **prior_backup_dirs, + manifest_data **manifests, + manifest_writer *mwriter, + cb_options *opt) +{ + char ifulldir[MAXPGPATH]; + char ofulldir[MAXPGPATH]; + char manifest_prefix[MAXPGPATH]; + DIR *dir; + struct dirent *de; + bool is_pg_tblspc; + bool is_pg_wal; + manifest_data *latest_manifest = manifests[n_prior_backups]; + pg_checksum_type checksum_type; + + /* + * pg_tblspc and pg_wal are special cases, so detect those here. + * + * pg_tblspc is only special at the top level, but subdirectories of + * pg_wal are just as special as the top level directory. + * + * Since incremental backup does not exist in pre-v10 versions, we don't + * have to worry about the old pg_xlog naming. + */ + is_pg_tblspc = !OidIsValid(tsoid) && relative_path != NULL && + strcmp(relative_path, "pg_tblspc") == 0; + is_pg_wal = !OidIsValid(tsoid) && relative_path != NULL && + (strcmp(relative_path, "pg_wal") == 0 || + strncmp(relative_path, "pg_wal/", 7) == 0); + + /* + * If we're under pg_wal, then we don't need checksums, because these + * files aren't included in the backup manifest. Otherwise use whatever + * type of checksum is configured. + */ + if (!is_pg_wal) + checksum_type = opt->manifest_checksums; + else + checksum_type = CHECKSUM_TYPE_NONE; + + /* + * Append the relative path to the input and output directories, and + * figure out the appropriate prefix to add to files in this directory + * when looking them up in a backup manifest. + */ + if (relative_path == NULL) + { + strncpy(ifulldir, input_directory, MAXPGPATH); + strncpy(ofulldir, output_directory, MAXPGPATH); + if (OidIsValid(tsoid)) + snprintf(manifest_prefix, MAXPGPATH, "pg_tblspc/%u/", tsoid); + else + manifest_prefix[0] = '\0'; + } + else + { + snprintf(ifulldir, MAXPGPATH, "%s/%s", input_directory, + relative_path); + snprintf(ofulldir, MAXPGPATH, "%s/%s", output_directory, + relative_path); + if (OidIsValid(tsoid)) + snprintf(manifest_prefix, MAXPGPATH, "pg_tblspc/%u/%s/", + tsoid, relative_path); + else + snprintf(manifest_prefix, MAXPGPATH, "%s/", relative_path); + } + + /* + * Toplevel output directories have already been created by the time this + * function is called, but any subdirectories are our responsibility. + */ + if (relative_path != NULL) + { + if (opt->dry_run) + pg_log_debug("would create directory \"%s\"", ofulldir); + else + { + pg_log_debug("creating directory \"%s\"", ofulldir); + if (mkdir(ofulldir, pg_dir_create_mode) == -1) + pg_fatal("could not create directory \"%s\": %m", ofulldir); + } + } + + /* It's time to scan the directory. */ + if ((dir = opendir(ifulldir)) == NULL) + pg_fatal("could not open directory \"%s\": %m", ifulldir); + while (errno = 0, (de = readdir(dir)) != NULL) + { + PGFileType type; + char ifullpath[MAXPGPATH]; + char ofullpath[MAXPGPATH]; + char manifest_path[MAXPGPATH]; + Oid oid = InvalidOid; + int checksum_length = 0; + uint8 *checksum_payload = NULL; + pg_checksum_context checksum_ctx; + + /* Ignore "." and ".." entries. */ + if (strcmp(de->d_name, ".") == 0 || + strcmp(de->d_name, "..") == 0) + continue; + + /* Construct input path. */ + snprintf(ifullpath, MAXPGPATH, "%s/%s", ifulldir, de->d_name); + + /* Figure out what kind of directory entry this is. */ + type = get_dirent_type(ifullpath, de, false, PG_LOG_ERROR); + if (type == PGFILETYPE_ERROR) + exit(1); + + /* + * If we're processing pg_tblspc, then check whether the filename + * looks like it could be a tablespace OID. If so, and if the + * directory entry is a symbolic link or a directory, skip it. + * + * Our goal here is to ignore anything that would have been considered + * by scan_for_existing_tablespaces to be a tablespace. + */ + if (is_pg_tblspc && parse_oid(de->d_name, &oid) && + (type == PGFILETYPE_LNK || type == PGFILETYPE_DIR)) + continue; + + /* If it's a directory, recurse. */ + if (type == PGFILETYPE_DIR) + { + char new_relative_path[MAXPGPATH]; + + /* Append new pathname component to relative path. */ + if (relative_path == NULL) + strncpy(new_relative_path, de->d_name, MAXPGPATH); + else + snprintf(new_relative_path, MAXPGPATH, "%s/%s", relative_path, + de->d_name); + + /* And recurse. */ + process_directory_recursively(tsoid, + input_directory, output_directory, + new_relative_path, + n_prior_backups, prior_backup_dirs, + manifests, mwriter, opt); + continue; + } + + /* Skip anything that's not a regular file. */ + if (type != PGFILETYPE_REG) + { + if (type == PGFILETYPE_LNK) + pg_log_warning("skipping symbolic link \"%s\"", ifullpath); + else + pg_log_warning("skipping special file \"%s\"", ifullpath); + continue; + } + + /* + * Skip the backup_label and backup_manifest files; they require + * special handling and are handled elsewhere. + */ + if (relative_path == NULL && + (strcmp(de->d_name, "backup_label") == 0 || + strcmp(de->d_name, "backup_manifest") == 0)) + continue; + + /* + * If it's an incremental file, hand it off to the reconstruction + * code, which will figure out what to do. + */ + if (strncmp(de->d_name, INCREMENTAL_PREFIX, + INCREMENTAL_PREFIX_LENGTH) == 0) + { + /* Output path should not include "INCREMENTAL." prefix. */ + snprintf(ofullpath, MAXPGPATH, "%s/%s", ofulldir, + de->d_name + INCREMENTAL_PREFIX_LENGTH); + + + /* Manifest path likewise omits incremental prefix. */ + snprintf(manifest_path, MAXPGPATH, "%s%s", manifest_prefix, + de->d_name + INCREMENTAL_PREFIX_LENGTH); + + /* Reconstruction logic will do the rest. */ + reconstruct_from_incremental_file(ifullpath, ofullpath, + relative_path, + de->d_name + INCREMENTAL_PREFIX_LENGTH, + n_prior_backups, + prior_backup_dirs, + manifests, + manifest_path, + checksum_type, + &checksum_length, + &checksum_payload, + opt->debug, + opt->dry_run); + } + else + { + /* Construct the path that the backup_manifest will use. */ + snprintf(manifest_path, MAXPGPATH, "%s%s", manifest_prefix, + de->d_name); + + /* + * It's not an incremental file, so we need to copy the entire + * file to the output directory. + * + * If a checksum of the required type already exists in the + * backup_manifest for the final input directory, we can save some + * work by reusing that checksum instead of computing a new one. + */ + if (checksum_type != CHECKSUM_TYPE_NONE && + latest_manifest != NULL) + { + manifest_file *mfile; + + mfile = manifest_files_lookup(latest_manifest->files, + manifest_path); + if (mfile == NULL) + { + char *bmpath; + + /* + * The directory is out of sync with the backup_manifest, + * so emit a warning. + */ + bmpath = psprintf("%s/%s", input_directory, + "backup_manifest"); + pg_log_warning("\"%s\" contains no entry for \"%s\"", + bmpath, manifest_path); + pfree(bmpath); + } + else if (mfile->checksum_type == checksum_type) + { + checksum_length = mfile->checksum_length; + checksum_payload = mfile->checksum_payload; + } + } + + /* + * If we're reusing a checksum, then we don't need copy_file() to + * compute one for us, but otherwise, it needs to compute whatever + * type of checksum we need. + */ + if (checksum_length != 0) + pg_checksum_init(&checksum_ctx, CHECKSUM_TYPE_NONE); + else + pg_checksum_init(&checksum_ctx, checksum_type); + + /* Actually copy the file. */ + snprintf(ofullpath, MAXPGPATH, "%s/%s", ofulldir, de->d_name); + copy_file(ifullpath, ofullpath, &checksum_ctx, opt->dry_run); + + /* + * If copy_file() performed a checksum calculation for us, then + * save the results (except in dry-run mode, when there's no + * point). + */ + if (checksum_ctx.type != CHECKSUM_TYPE_NONE && !opt->dry_run) + { + checksum_payload = pg_malloc(PG_CHECKSUM_MAX_LENGTH); + checksum_length = pg_checksum_final(&checksum_ctx, + checksum_payload); + } + } + + /* Generate manifest entry, if needed. */ + if (mwriter != NULL) + { + struct stat sb; + + /* + * In order to generate a manifest entry, we need the file size + * and mtime. We have no way to know the correct mtime except to + * stat() the file, so just do that and get the size as well. + * + * If we didn't need the mtime here, we could try to obtain the + * file size from the reconstruction or file copy process above, + * although that is actually not convenient in all cases. If we + * write the file ourselves then clearly we can keep a count of + * bytes, but if we use something like CopyFile() then it's + * trickier. Since we have to stat() anyway to get the mtime, + * there's no point in worrying about it. + */ + if (stat(ofullpath, &sb) < 0) + pg_fatal("could not stat file \"%s\": %m", ofullpath); + + /* OK, now do the work. */ + add_file_to_manifest(mwriter, manifest_path, + sb.st_size, sb.st_mtime, + checksum_type, checksum_length, + checksum_payload); + } + + /* Avoid leaking memory. */ + if (checksum_payload != NULL) + pfree(checksum_payload); + } + + closedir(dir); +} + +/* + * Read the version number from PG_VERSION and convert it to the usual server + * version number format. (e.g. If PG_VERSION contains "14\n" this function + * will return 140000) + */ +static int +read_pg_version_file(char *directory) +{ + char filename[MAXPGPATH]; + StringInfoData buf; + int fd; + int version; + char *ep; + + /* Construct pathname. */ + snprintf(filename, MAXPGPATH, "%s/PG_VERSION", directory); + + /* Open file. */ + if ((fd = open(filename, O_RDONLY, 0)) < 0) + pg_fatal("could not open file \"%s\": %m", filename); + + /* Read into memory. Length limit of 128 should be more than generous. */ + initStringInfo(&buf); + slurp_file(fd, filename, &buf, 128); + + /* Close the file. */ + if (close(fd) != 0) + pg_fatal("could not close \"%s\": %m", filename); + + /* Convert to integer. */ + errno = 0; + version = strtoul(buf.data, &ep, 10); + if (errno != 0 || *ep != '\n') + { + /* + * Incremental backup is not relevant to very old server versions that + * used multi-part version number (e.g. 9.6, or 8.4). So if we see + * what looks like the beginning of such a version number, just bail + * out. + */ + if (version < 10 && *ep == '.') + pg_fatal("%s: server version too old\n", filename); + pg_fatal("%s: could not parse version number\n", filename); + } + + /* Debugging output. */ + pg_log_debug("read server version %d from \"%s\"", version, filename); + + /* Release memory and return result. */ + pfree(buf.data); + return version * 10000; +} + +/* + * Add a directory to the list of output directories to clean up. + */ +static void +remember_to_cleanup_directory(char *target_path, bool rmtopdir) +{ + cb_cleanup_dir *dir = pg_malloc(sizeof(cb_cleanup_dir)); + + dir->target_path = target_path; + dir->rmtopdir = rmtopdir; + dir->next = cleanup_dir_list; + cleanup_dir_list = dir; +} + +/* + * Empty out the list of directories scheduled for cleanup a exit. + * + * We want to remove the output directories only on a failure, so call this + * function when we know that the operation has succeeded. + * + * Since we only expect this to be called when we're about to exit, we could + * just set cleanup_dir_list to NULL and be done with it, but we free the + * memory to be tidy. + */ +static void +reset_directory_cleanup_list(void) +{ + while (cleanup_dir_list != NULL) + { + cb_cleanup_dir *dir = cleanup_dir_list; + + cleanup_dir_list = cleanup_dir_list->next; + pfree(dir); + } +} + +/* + * Scan the pg_tblspc directory of the final input backup to get a canonical + * list of what tablespaces are part of the backup. + * + * 'pathname' should be the path to the toplevel backup directory for the + * final backup in the backup chain. + */ +static cb_tablespace * +scan_for_existing_tablespaces(char *pathname, cb_options *opt) +{ + char pg_tblspc[MAXPGPATH]; + DIR *dir; + struct dirent *de; + cb_tablespace *tslist = NULL; + + snprintf(pg_tblspc, MAXPGPATH, "%s/pg_tblspc", pathname); + pg_log_debug("scanning \"%s\"", pg_tblspc); + + if ((dir = opendir(pg_tblspc)) == NULL) + pg_fatal("could not open directory \"%s\": %m", pathname); + + while (errno = 0, (de = readdir(dir)) != NULL) + { + Oid oid; + char tblspcdir[MAXPGPATH]; + char link_target[MAXPGPATH]; + int link_length; + cb_tablespace *ts; + cb_tablespace *otherts; + PGFileType type; + + /* Silently ignore "." and ".." entries. */ + if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0) + continue; + + /* Construct full pathname. */ + snprintf(tblspcdir, MAXPGPATH, "%s/%s", pg_tblspc, de->d_name); + + /* Ignore any file name that doesn't look like a proper OID. */ + if (!parse_oid(de->d_name, &oid)) + { + pg_log_debug("skipping \"%s\" because the filename is not a legal tablespace OID", + tblspcdir); + continue; + } + + /* Only symbolic links and directories are tablespaces. */ + type = get_dirent_type(tblspcdir, de, false, PG_LOG_ERROR); + if (type == PGFILETYPE_ERROR) + exit(1); + if (type != PGFILETYPE_LNK && type != PGFILETYPE_DIR) + { + pg_log_debug("skipping \"%s\" because it is neither a symbolic link nor a directory", + tblspcdir); + continue; + } + + /* Create a new tablespace object. */ + ts = pg_malloc0(sizeof(cb_tablespace)); + ts->oid = oid; + + /* + * If it's a link, it's not an in-place tablespace. Otherwise, it must + * be a directory, and thus an in-place tablespace. + */ + if (type == PGFILETYPE_LNK) + { + cb_tablespace_mapping *tsmap; + + /* Read the link target. */ + link_length = readlink(tblspcdir, link_target, sizeof(link_target)); + if (link_length < 0) + pg_fatal("could not read symbolic link \"%s\": %m", + tblspcdir); + if (link_length >= sizeof(link_target)) + pg_fatal("symbolic link \"%s\" is too long", tblspcdir); + link_target[link_length] = '\0'; + if (!is_absolute_path(link_target)) + pg_fatal("symbolic link \"%s\" is relative", tblspcdir); + + /* Caonicalize the link target. */ + canonicalize_path(link_target); + + /* + * Find the corresponding tablespace mapping and copy the relevant + * details into the new tablespace entry. + */ + for (tsmap = opt->tsmappings; tsmap != NULL; tsmap = tsmap->next) + { + if (strcmp(tsmap->old_dir, link_target) == 0) + { + strncpy(ts->old_dir, tsmap->old_dir, MAXPGPATH); + strncpy(ts->new_dir, tsmap->new_dir, MAXPGPATH); + ts->in_place = false; + break; + } + } + + /* Every non-in-place tablespace must be mapped. */ + if (tsmap == NULL) + pg_fatal("tablespace at \"%s\" has no tablespace mapping", + link_target); + } + else + { + /* + * For an in-place tablespace, there's no separate directory, so + * we just record the paths within the data directories. + */ + snprintf(ts->old_dir, MAXPGPATH, "%s/%s", pg_tblspc, de->d_name); + snprintf(ts->new_dir, MAXPGPATH, "%s/pg_tblpc/%s", opt->output, + de->d_name); + ts->in_place = true; + } + + /* Tablespaces should not share a directory. */ + for (otherts = tslist; otherts != NULL; otherts = otherts->next) + if (strcmp(ts->new_dir, otherts->new_dir) == 0) + pg_fatal("tablespaces with OIDs %u and %u both point at \"%s\"", + otherts->oid, oid, ts->new_dir); + + /* Add this tablespace to the list. */ + ts->next = tslist; + tslist = ts; + } + + return tslist; +} + +/* + * Read a file into a StringInfo. + * + * fd is used for the actual file I/O, filename for error reporting purposes. + * A file longer than maxlen is a fatal error. + */ +static void +slurp_file(int fd, char *filename, StringInfo buf, int maxlen) +{ + struct stat st; + ssize_t rb; + + /* Check file size, and complain if it's too large. */ + if (fstat(fd, &st) != 0) + pg_fatal("could not stat \"%s\": %m", filename); + if (st.st_size > maxlen) + pg_fatal("file \"%s\" is too large", filename); + + /* Make sure we have enough space. */ + enlargeStringInfo(buf, st.st_size); + + /* Read the data. */ + rb = read(fd, &buf->data[buf->len], st.st_size); + + /* + * We don't expect any concurrent changes, so we should read exactly the + * expected number of bytes. + */ + if (rb != st.st_size) + { + if (rb < 0) + pg_fatal("could not read file \"%s\": %m", filename); + else + pg_fatal("could not read file \"%s\": read only %d of %d bytes", + filename, (int) rb, (int) st.st_size); + } + + /* Adjust buffer length for new data and restore trailing-\0 invariant */ + buf->len += rb; + buf->data[buf->len] = '\0'; +} diff --git a/src/bin/pg_combinebackup/reconstruct.c b/src/bin/pg_combinebackup/reconstruct.c new file mode 100644 index 0000000000000..6decdd8934039 --- /dev/null +++ b/src/bin/pg_combinebackup/reconstruct.c @@ -0,0 +1,687 @@ +/*------------------------------------------------------------------------- + * + * reconstruct.c + * Reconstruct full file from incremental file and backup chain. + * + * Copyright (c) 2017-2023, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/bin/pg_combinebackup/reconstruct.c + * + *------------------------------------------------------------------------- + */ +#include "postgres_fe.h" + +#include + +#include "backup/basebackup_incremental.h" +#include "common/logging.h" +#include "common/file_perm.h" +#include "copy_file.h" +#include "lib/stringinfo.h" +#include "reconstruct.h" +#include "storage/block.h" + +/* + * An rfile stores the data that we need in order to be able to use some file + * on disk for reconstruction. For any given output file, we create one rfile + * per backup that we need to consult when we constructing that output file. + * + * If we find a full version of the file in the backup chain, then only + * filename and fd are initialized; the remaining fields are 0 or NULL. + * For an incremental file, header_length, num_blocks, relative_block_numbers, + * and truncation_block_length are also set. + * + * num_blocks_read and highest_offset_read always start out as 0. + */ +typedef struct rfile +{ + char *filename; + int fd; + size_t header_length; + unsigned num_blocks; + BlockNumber *relative_block_numbers; + unsigned truncation_block_length; + unsigned num_blocks_read; + off_t highest_offset_read; +} rfile; + +static void debug_reconstruction(int n_source, + rfile **sources, + bool dry_run); +static unsigned find_reconstructed_block_length(rfile *s); +static rfile *make_incremental_rfile(char *filename); +static rfile *make_rfile(char *filename, bool missing_ok); +static void write_reconstructed_file(char *input_filename, + char *output_filename, + unsigned block_length, + rfile **sourcemap, + off_t *offsetmap, + pg_checksum_context *checksum_ctx, + bool debug, + bool dry_run); +static void read_bytes(rfile *rf, void *buffer, unsigned length); + +/* + * Reconstruct a full file from an incremental file and a chain of prior + * backups. + * + * input_filename should be the path to the incremental file, and + * output_filename should be the path where the reconstructed file is to be + * written. + * + * relative_path should be the relative path to the directory containing this + * file. bare_file_name should be the name of the file within that directory, + * without "INCREMENTAL.". + * + * n_prior_backups is the number of prior backups, and prior_backup_dirs is + * an array of pathnames where those backups can be found. + */ +void +reconstruct_from_incremental_file(char *input_filename, + char *output_filename, + char *relative_path, + char *bare_file_name, + int n_prior_backups, + char **prior_backup_dirs, + manifest_data **manifests, + char *manifest_path, + pg_checksum_type checksum_type, + int *checksum_length, + uint8 **checksum_payload, + bool debug, + bool dry_run) +{ + rfile **source; + rfile *latest_source = NULL; + rfile **sourcemap; + off_t *offsetmap; + unsigned block_length; + unsigned i; + unsigned sidx = n_prior_backups; + bool full_copy_possible = true; + int copy_source_index = -1; + rfile *copy_source = NULL; + pg_checksum_context checksum_ctx; + + /* + * Every block must come either from the latest version of the file or + * from one of the prior backups. + */ + source = pg_malloc0(sizeof(rfile *) * (1 + n_prior_backups)); + + /* + * Use the information from the latest incremental file to figure out how + * long the reconstructed file should be. + */ + latest_source = make_incremental_rfile(input_filename); + source[n_prior_backups] = latest_source; + block_length = find_reconstructed_block_length(latest_source); + + /* + * For each block in the output file, we need to know from which file we + * need to obtain it and at what offset in that file it's stored. + * sourcemap gives us the first of these things, and offsetmap the latter. + */ + sourcemap = pg_malloc0(sizeof(rfile *) * block_length); + offsetmap = pg_malloc0(sizeof(off_t) * block_length); + + /* + * Every block that is present in the newest incremental file should be + * sourced from that file. If it precedes the truncation_block_length, + * it's a block that we would otherwise have had to find in an older + * backup and thus reduces the number of blocks remaining to be found by + * one; otherwise, it's an extra block that needs to be included in the + * output but would not have needed to be found in an older backup if it + * had not been present. + */ + for (i = 0; i < latest_source->num_blocks; ++i) + { + BlockNumber b = latest_source->relative_block_numbers[i]; + + Assert(b < block_length); + sourcemap[b] = latest_source; + offsetmap[b] = latest_source->header_length + (i * BLCKSZ); + + /* + * A full copy of a file from an earlier backup is only possible if no + * blocks are needed from any later incremental file. + */ + full_copy_possible = false; + } + + while (1) + { + char source_filename[MAXPGPATH]; + rfile *s; + + /* + * Move to the next backup in the chain. If there are no more, then + * we're done. + */ + if (sidx == 0) + break; + --sidx; + + /* + * Look for the full file in the previous backup. If not found, then + * look for an incremental file instead. + */ + snprintf(source_filename, MAXPGPATH, "%s/%s/%s", + prior_backup_dirs[sidx], relative_path, bare_file_name); + if ((s = make_rfile(source_filename, true)) == NULL) + { + snprintf(source_filename, MAXPGPATH, "%s/%s/INCREMENTAL.%s", + prior_backup_dirs[sidx], relative_path, bare_file_name); + s = make_incremental_rfile(source_filename); + } + source[sidx] = s; + + /* + * If s->header_length == 0, then this is a full file; otherwise, it's + * an incremental file. + */ + if (s->header_length == 0) + { + struct stat sb; + BlockNumber b; + BlockNumber blocklength; + + /* We need to know the length of the file. */ + if (fstat(s->fd, &sb) < 0) + pg_fatal("could not stat \"%s\": %m", s->filename); + + /* + * Since we found a full file, source all blocks from it that + * exist in the file. + * + * Note that there may be blocks that don't exist either in this + * file or in any incremental file but that precede + * truncation_block_length. These are, presumably, zero-filled + * blocks that result from the server extending the file but + * taking no action on those blocks that generated any WAL. + * + * Sadly, we have no way of validating that this is really what + * happened, and neither does the server. From it's perspective, + * an unmodified block that contains data looks exactly the same + * as a zero-filled block that never had any data: either way, + * it's not mentioned in any WAL summary and the server has no + * reason to read it. From our perspective, all we know is that + * nobody had a reason to back up the block. That certainly means + * that the block didn't exist at the time of the full backup, but + * the supposition that it was all zeroes at the time of every + * later backup is one that we can't validate. + */ + blocklength = sb.st_size / BLCKSZ; + for (b = 0; b < latest_source->truncation_block_length; ++b) + { + if (sourcemap[b] == NULL && b < blocklength) + { + sourcemap[b] = s; + offsetmap[b] = b * BLCKSZ; + } + } + + /* + * If a full copy looks possible, check whether the resulting file + * should be exactly as long as the source file is. If so, a full + * copy is acceptable, otherwise not. + */ + if (full_copy_possible) + { + uint64 expected_length; + + expected_length = + (uint64) latest_source->truncation_block_length; + expected_length *= BLCKSZ; + if (expected_length == sb.st_size) + { + copy_source = s; + copy_source_index = sidx; + } + } + + /* We don't need to consider any further sources. */ + break; + } + + /* + * Since we found another incremental file, source all blocks from it + * that we need but don't yet have. + */ + for (i = 0; i < s->num_blocks; ++i) + { + BlockNumber b = s->relative_block_numbers[i]; + + if (b < latest_source->truncation_block_length && + sourcemap[b] == NULL) + { + sourcemap[b] = s; + offsetmap[b] = s->header_length + (i * BLCKSZ); + + /* + * A full copy of a file from an earlier backup is only + * possible if no blocks are needed from any later incremental + * file. + */ + full_copy_possible = false; + } + } + } + + /* + * If a checksum of the required type already exists in the + * backup_manifest for the relevant input directory, we can save some work + * by reusing that checksum instead of computing a new one. + */ + if (copy_source_index >= 0 && manifests[copy_source_index] != NULL && + checksum_type != CHECKSUM_TYPE_NONE) + { + manifest_file *mfile; + + mfile = manifest_files_lookup(manifests[copy_source_index]->files, + manifest_path); + if (mfile == NULL) + { + char *path = psprintf("%s/backup_manifest", + prior_backup_dirs[copy_source_index]); + + /* + * The directory is out of sync with the backup_manifest, so emit + * a warning. + */ + /*- translator: the first %s is a backup manifest file, the second is a file absent therein */ + pg_log_warning("\"%s\" contains no entry for \"%s\"", + path, + manifest_path); + pfree(path); + } + else if (mfile->checksum_type == checksum_type) + { + *checksum_length = mfile->checksum_length; + *checksum_payload = pg_malloc(*checksum_length); + memcpy(*checksum_payload, mfile->checksum_payload, + *checksum_length); + checksum_type = CHECKSUM_TYPE_NONE; + } + } + + /* Prepare for checksum calculation, if required. */ + pg_checksum_init(&checksum_ctx, checksum_type); + + /* + * If the full file can be created by copying a file from an older backup + * in the chain without needing to overwrite any blocks or truncate the + * result, then forget about performing reconstruction and just copy that + * file in its entirety. + * + * Otherwise, reconstruct. + */ + if (copy_source != NULL) + copy_file(copy_source->filename, output_filename, + &checksum_ctx, dry_run); + else + { + write_reconstructed_file(input_filename, output_filename, + block_length, sourcemap, offsetmap, + &checksum_ctx, debug, dry_run); + debug_reconstruction(n_prior_backups + 1, source, dry_run); + } + + /* Save results of checksum calculation. */ + if (checksum_type != CHECKSUM_TYPE_NONE) + { + *checksum_payload = pg_malloc(PG_CHECKSUM_MAX_LENGTH); + *checksum_length = pg_checksum_final(&checksum_ctx, + *checksum_payload); + } + + /* + * Close files and release memory. + */ + for (i = 0; i <= n_prior_backups; ++i) + { + rfile *s = source[i]; + + if (s == NULL) + continue; + if (close(s->fd) != 0) + pg_fatal("could not close \"%s\": %m", s->filename); + if (s->relative_block_numbers != NULL) + pfree(s->relative_block_numbers); + pg_free(s->filename); + } + pfree(sourcemap); + pfree(offsetmap); + pfree(source); +} + +/* + * Perform post-reconstruction logging and sanity checks. + */ +static void +debug_reconstruction(int n_source, rfile **sources, bool dry_run) +{ + unsigned i; + + for (i = 0; i < n_source; ++i) + { + rfile *s = sources[i]; + + /* Ignore source if not used. */ + if (s == NULL) + continue; + + /* If no data is needed from this file, we can ignore it. */ + if (s->num_blocks_read == 0) + continue; + + /* Debug logging. */ + if (dry_run) + pg_log_debug("would have read %u blocks from \"%s\"", + s->num_blocks_read, s->filename); + else + pg_log_debug("read %u blocks from \"%s\"", + s->num_blocks_read, s->filename); + + /* + * In dry-run mode, we don't actually try to read data from the file, + * but we do try to verify that the file is long enough that we could + * have read the data if we'd tried. + * + * If this fails, then it means that a non-dry-run attempt would fail, + * complaining of not being able to read the required bytes from the + * file. + */ + if (dry_run) + { + struct stat sb; + + if (fstat(s->fd, &sb) < 0) + pg_fatal("could not stat \"%s\": %m", s->filename); + if (sb.st_size < s->highest_offset_read) + pg_fatal("file \"%s\" is too short: expected %llu, found %llu", + s->filename, + (unsigned long long) s->highest_offset_read, + (unsigned long long) sb.st_size); + } + } +} + +/* + * When we perform reconstruction using an incremental file, the output file + * should be at least as long as the truncation_block_length. Any blocks + * present in the incremental file increase the output length as far as is + * necessary to include those blocks. + */ +static unsigned +find_reconstructed_block_length(rfile *s) +{ + unsigned block_length = s->truncation_block_length; + unsigned i; + + for (i = 0; i < s->num_blocks; ++i) + if (s->relative_block_numbers[i] >= block_length) + block_length = s->relative_block_numbers[i] + 1; + + return block_length; +} + +/* + * Initialize an incremental rfile, reading the header so that we know which + * blocks it contains. + */ +static rfile * +make_incremental_rfile(char *filename) +{ + rfile *rf; + unsigned magic; + + rf = make_rfile(filename, false); + + /* Read and validate magic number. */ + read_bytes(rf, &magic, sizeof(magic)); + if (magic != INCREMENTAL_MAGIC) + pg_fatal("file \"%s\" has bad incremental magic number (0x%x not 0x%x)", + filename, magic, INCREMENTAL_MAGIC); + + /* Read block count. */ + read_bytes(rf, &rf->num_blocks, sizeof(rf->num_blocks)); + if (rf->num_blocks > RELSEG_SIZE) + pg_fatal("file \"%s\" has block count %u in excess of segment size %u", + filename, rf->num_blocks, RELSEG_SIZE); + + /* Read truncation block length. */ + read_bytes(rf, &rf->truncation_block_length, + sizeof(rf->truncation_block_length)); + if (rf->truncation_block_length > RELSEG_SIZE) + pg_fatal("file \"%s\" has truncation block length %u in excess of segment size %u", + filename, rf->truncation_block_length, RELSEG_SIZE); + + /* Read block numbers if there are any. */ + if (rf->num_blocks > 0) + { + rf->relative_block_numbers = + pg_malloc0(sizeof(BlockNumber) * rf->num_blocks); + read_bytes(rf, rf->relative_block_numbers, + sizeof(BlockNumber) * rf->num_blocks); + } + + /* Remember length of header. */ + rf->header_length = sizeof(magic) + sizeof(rf->num_blocks) + + sizeof(rf->truncation_block_length) + + sizeof(BlockNumber) * rf->num_blocks; + + return rf; +} + +/* + * Allocate and perform basic initialization of an rfile. + */ +static rfile * +make_rfile(char *filename, bool missing_ok) +{ + rfile *rf; + + rf = pg_malloc0(sizeof(rfile)); + rf->filename = pstrdup(filename); + if ((rf->fd = open(filename, O_RDONLY | PG_BINARY, 0)) < 0) + { + if (missing_ok && errno == ENOENT) + { + pg_free(rf); + return NULL; + } + pg_fatal("could not open file \"%s\": %m", filename); + } + + return rf; +} + +/* + * Read the indicated number of bytes from an rfile into the buffer. + */ +static void +read_bytes(rfile *rf, void *buffer, unsigned length) +{ + unsigned rb = read(rf->fd, buffer, length); + + if (rb != length) + { + if (rb < 0) + pg_fatal("could not read file \"%s\": %m", rf->filename); + else + pg_fatal("could not read file \"%s\": read only %d of %d bytes", + rf->filename, (int) rb, length); + } +} + +/* + * Write out a reconstructed file. + */ +static void +write_reconstructed_file(char *input_filename, + char *output_filename, + unsigned block_length, + rfile **sourcemap, + off_t *offsetmap, + pg_checksum_context *checksum_ctx, + bool debug, + bool dry_run) +{ + int wfd = -1; + unsigned i; + unsigned zero_blocks = 0; + + /* Debugging output. */ + if (debug) + { + StringInfoData debug_buf; + unsigned start_of_range = 0; + unsigned current_block = 0; + + /* Basic information about the output file to be produced. */ + if (dry_run) + pg_log_debug("would reconstruct \"%s\" (%u blocks, checksum %s)", + output_filename, block_length, + pg_checksum_type_name(checksum_ctx->type)); + else + pg_log_debug("reconstructing \"%s\" (%u blocks, checksum %s)", + output_filename, block_length, + pg_checksum_type_name(checksum_ctx->type)); + + /* Print out the plan for reconstructing this file. */ + initStringInfo(&debug_buf); + while (current_block < block_length) + { + rfile *s = sourcemap[current_block]; + + /* Extend range, if possible. */ + if (current_block + 1 < block_length && + s == sourcemap[current_block + 1]) + { + ++current_block; + continue; + } + + /* Add details about this range. */ + if (s == NULL) + { + if (current_block == start_of_range) + appendStringInfo(&debug_buf, " %u:zero", current_block); + else + appendStringInfo(&debug_buf, " %u-%u:zero", + start_of_range, current_block); + } + else + { + if (current_block == start_of_range) + appendStringInfo(&debug_buf, " %u:%s@" UINT64_FORMAT, + current_block, + s == NULL ? "ZERO" : s->filename, + (uint64) offsetmap[current_block]); + else + appendStringInfo(&debug_buf, " %u-%u:%s@" UINT64_FORMAT, + start_of_range, current_block, + s == NULL ? "ZERO" : s->filename, + (uint64) offsetmap[current_block]); + } + + /* Begin new range. */ + start_of_range = ++current_block; + + /* If the output is very long or we are done, dump it now. */ + if (current_block == block_length || debug_buf.len > 1024) + { + pg_log_debug("reconstruction plan:%s", debug_buf.data); + resetStringInfo(&debug_buf); + } + } + + /* Free memory. */ + pfree(debug_buf.data); + } + + /* Open the output file, except in dry_run mode. */ + if (!dry_run && + (wfd = open(output_filename, + O_RDWR | PG_BINARY | O_CREAT | O_EXCL, + pg_file_create_mode)) < 0) + pg_fatal("could not open file \"%s\": %m", output_filename); + + /* Read and write the blocks as required. */ + for (i = 0; i < block_length; ++i) + { + uint8 buffer[BLCKSZ]; + rfile *s = sourcemap[i]; + unsigned wb; + + /* Update accounting information. */ + if (s == NULL) + ++zero_blocks; + else + { + s->num_blocks_read++; + s->highest_offset_read = Max(s->highest_offset_read, + offsetmap[i] + BLCKSZ); + } + + /* Skip the rest of this in dry-run mode. */ + if (dry_run) + continue; + + /* Read or zero-fill the block as appropriate. */ + if (s == NULL) + { + /* + * New block not mentioned in the WAL summary. Should have been an + * uninitialized block, so just zero-fill it. + */ + memset(buffer, 0, BLCKSZ); + } + else + { + unsigned rb; + + /* Read the block from the correct source, except if dry-run. */ + rb = pg_pread(s->fd, buffer, BLCKSZ, offsetmap[i]); + if (rb != BLCKSZ) + { + if (rb < 0) + pg_fatal("could not read file \"%s\": %m", s->filename); + else + pg_fatal("could not read file \"%s\": read only %d of %d bytes at offset %u", + s->filename, (int) rb, BLCKSZ, + (unsigned) offsetmap[i]); + } + } + + /* Write out the block. */ + if ((wb = write(wfd, buffer, BLCKSZ)) != BLCKSZ) + { + if (wb < 0) + pg_fatal("could not write file \"%s\": %m", output_filename); + else + pg_fatal("could not write file \"%s\": wrote only %d of %d bytes", + output_filename, (int) wb, BLCKSZ); + } + + /* Update the checksum computation. */ + if (pg_checksum_update(checksum_ctx, buffer, BLCKSZ) < 0) + pg_fatal("could not update checksum of file \"%s\"", + output_filename); + } + + /* Debugging output. */ + if (zero_blocks > 0) + { + if (dry_run) + pg_log_debug("would have zero-filled %u blocks", zero_blocks); + else + pg_log_debug("zero-filled %u blocks", zero_blocks); + } + + /* Close the output file. */ + if (wfd >= 0 && close(wfd) != 0) + pg_fatal("could not close \"%s\": %m", output_filename); +} diff --git a/src/bin/pg_combinebackup/reconstruct.h b/src/bin/pg_combinebackup/reconstruct.h new file mode 100644 index 0000000000000..d689aeb5c20d1 --- /dev/null +++ b/src/bin/pg_combinebackup/reconstruct.h @@ -0,0 +1,33 @@ +/*------------------------------------------------------------------------- + * + * reconstruct.h + * Reconstruct full file from incremental file and backup chain. + * + * Copyright (c) 2017-2023, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/bin/pg_combinebackup/reconstruct.h + * + *------------------------------------------------------------------------- + */ +#ifndef RECONSTRUCT_H +#define RECONSTRUCT_H + +#include "common/checksum_helper.h" +#include "load_manifest.h" + +extern void reconstruct_from_incremental_file(char *input_filename, + char *output_filename, + char *relative_path, + char *bare_file_name, + int n_prior_backups, + char **prior_backup_dirs, + manifest_data **manifests, + char *manifest_path, + pg_checksum_type checksum_type, + int *checksum_length, + uint8 **checksum_payload, + bool debug, + bool dry_run); + +#endif diff --git a/src/bin/pg_combinebackup/t/001_basic.pl b/src/bin/pg_combinebackup/t/001_basic.pl new file mode 100644 index 0000000000000..fb66075d1a8ed --- /dev/null +++ b/src/bin/pg_combinebackup/t/001_basic.pl @@ -0,0 +1,23 @@ +# Copyright (c) 2021-2023, PostgreSQL Global Development Group + +use strict; +use warnings; +use PostgreSQL::Test::Utils; +use Test::More; + +my $tempdir = PostgreSQL::Test::Utils::tempdir; + +program_help_ok('pg_combinebackup'); +program_version_ok('pg_combinebackup'); +program_options_handling_ok('pg_combinebackup'); + +command_fails_like( + ['pg_combinebackup'], + qr/no input directories specified/, + 'input directories must be specified'); +command_fails_like( + [ 'pg_combinebackup', $tempdir ], + qr/no output directory specified/, + 'output directory must be specified'); + +done_testing(); diff --git a/src/bin/pg_combinebackup/t/002_compare_backups.pl b/src/bin/pg_combinebackup/t/002_compare_backups.pl new file mode 100644 index 0000000000000..0b80455aff194 --- /dev/null +++ b/src/bin/pg_combinebackup/t/002_compare_backups.pl @@ -0,0 +1,154 @@ +# Copyright (c) 2021-2023, PostgreSQL Global Development Group + +use strict; +use warnings; +use File::Compare; +use PostgreSQL::Test::Cluster; +use PostgreSQL::Test::Utils; +use Test::More; + +# Set up a new database instance. +my $primary = PostgreSQL::Test::Cluster->new('primary'); +$primary->init(has_archiving => 1, allows_streaming => 1); +$primary->append_conf('postgresql.conf', 'summarize_wal = on'); +$primary->start; + +# Create some test tables, each containing one row of data, plus a whole +# extra database. +$primary->safe_psql('postgres', <backup_dir . '/backup1'; +$primary->command_ok( + [ 'pg_basebackup', '-D', $backup1path, '--no-sync', '-cfast' ], + "full backup"); + +# Now make some database changes. +$primary->safe_psql('postgres', <backup_dir . '/backup2'; +$primary->command_ok( + [ 'pg_basebackup', '-D', $backup2path, '--no-sync', '-cfast', + '--incremental', $backup1path . '/backup_manifest' ], + "incremental backup"); + +# Find an LSN to which either backup can be recovered. +my $lsn = $primary->safe_psql('postgres', "SELECT pg_current_wal_lsn();"); + +# Make sure that the WAL segment containing that LSN has been archived. +# PostgreSQL won't issue two consecutive XLOG_SWITCH records, and the backup +# just issued one, so call txid_current() to generate some WAL activity +# before calling pg_switch_wal(). +$primary->safe_psql('postgres', 'SELECT txid_current();'); +$primary->safe_psql('postgres', 'SELECT pg_switch_wal()'); + +# Now wait for the LSN we chose above to be archived. +my $archive_wait_query = + "SELECT pg_walfile_name('$lsn') <= last_archived_wal FROM pg_stat_archiver;"; +$primary->poll_query_until('postgres', $archive_wait_query) + or die "Timed out while waiting for WAL segment to be archived"; + +# Perform PITR from the full backup. Disable archive_mode so that the archive +# doesn't find out about the new timeline; that way, the later PITR below will +# choose the same timeline. +my $pitr1 = PostgreSQL::Test::Cluster->new('pitr1'); +$pitr1->init_from_backup($primary, 'backup1', + standby => 1, has_restoring => 1); +$pitr1->append_conf('postgresql.conf', qq{ +recovery_target_lsn = '$lsn' +recovery_target_action = 'promote' +archive_mode = 'off' +}); +$pitr1->start(); + +# Perform PITR to the same LSN from the incremental backup. Use the same +# basic configuration as before. +my $pitr2 = PostgreSQL::Test::Cluster->new('pitr2'); +$pitr2->init_from_backup($primary, 'backup2', + standby => 1, has_restoring => 1, + combine_with_prior => [ 'backup1' ]); +$pitr2->append_conf('postgresql.conf', qq{ +recovery_target_lsn = '$lsn' +recovery_target_action = 'promote' +archive_mode = 'off' +}); +$pitr2->start(); + +# Wait until both servers exit recovery. +$pitr1->poll_query_until('postgres', + "SELECT NOT pg_is_in_recovery();") + or die "Timed out while waiting apply to reach LSN $lsn"; +$pitr2->poll_query_until('postgres', + "SELECT NOT pg_is_in_recovery();") + or die "Timed out while waiting apply to reach LSN $lsn"; + +# Perform a logical dump of each server, and check that they match. +# It would be much nicer if we could physically compare the data files, but +# that doesn't really work. The contents of the page hole aren't guaranteed to +# be identical, and there can be other discrepancies as well. To make this work +# we'd need the equivalent of each AM's rm_mask functon written or at least +# callable from Perl, and that doesn't seem practical. +# +# NB: We're just using the primary's backup directory for scratch space here. +# This could equally well be any other directory we wanted to pick. +my $backupdir = $primary->backup_dir; +my $dump1 = $backupdir . '/pitr1.dump'; +my $dump2 = $backupdir . '/pitr2.dump'; +$pitr1->command_ok([ + 'pg_dumpall', '-f', $dump1, '--no-sync', '--no-unlogged-table-data', + '-d', $pitr1->connstr('postgres'), + ], + 'dump from PITR 1'); +$pitr1->command_ok([ + 'pg_dumpall', '-f', $dump2, '--no-sync', '--no-unlogged-table-data', + '-d', $pitr1->connstr('postgres'), + ], + 'dump from PITR 2'); + +# Compare the two dumps, there should be no differences. +my $compare_res = compare($dump1, $dump2); +note($dump1); +note($dump2); +is($compare_res, 0, "dumps are identical"); + +# Provide more context if the dumps do not match. +if ($compare_res != 0) +{ + my ($stdout, $stderr) = + run_command([ 'diff', '-u', $dump1, $dump2 ]); + print "=== diff of $dump1 and $dump2\n"; + print "=== stdout ===\n"; + print $stdout; + print "=== stderr ===\n"; + print $stderr; + print "=== EOF ===\n"; +} + +done_testing(); diff --git a/src/bin/pg_combinebackup/t/003_timeline.pl b/src/bin/pg_combinebackup/t/003_timeline.pl new file mode 100644 index 0000000000000..bc053ca5e8c25 --- /dev/null +++ b/src/bin/pg_combinebackup/t/003_timeline.pl @@ -0,0 +1,90 @@ +# Copyright (c) 2021-2023, PostgreSQL Global Development Group +# +# This test aims to validate that restoring an incremental backup works +# properly even when the reference backup is on a different timeline. + +use strict; +use warnings; +use File::Compare; +use PostgreSQL::Test::Cluster; +use PostgreSQL::Test::Utils; +use Test::More; + +# Set up a new database instance. +my $node1 = PostgreSQL::Test::Cluster->new('node1'); +$node1->init(has_archiving => 1, allows_streaming => 1); +$node1->append_conf('postgresql.conf', 'summarize_wal = on'); +$node1->start; + +# Create a table and insert a test row into it. +$node1->safe_psql('postgres', <backup_dir . '/backup1'; +$node1->command_ok( + [ 'pg_basebackup', '-D', $backup1path, '--no-sync', '-cfast' ], + "full backup from node1"); + +# Insert a second row on the original node. +$node1->safe_psql('postgres', <backup_dir . '/backup2'; +$node1->command_ok( + [ 'pg_basebackup', '-D', $backup2path, '--no-sync', '-cfast', + '--incremental', $backup1path . '/backup_manifest' ], + "incremental backup from node1"); + +# Restore the incremental backup and use it to create a new node. +my $node2 = PostgreSQL::Test::Cluster->new('node2'); +$node2->init_from_backup($node1, 'backup2', + combine_with_prior => [ 'backup1' ]); +$node2->start(); + +# Insert rows on both nodes. +$node1->safe_psql('postgres', <safe_psql('postgres', <backup_dir . '/backup3'; +$node2->command_ok( + [ 'pg_basebackup', '-D', $backup3path, '--no-sync', '-cfast', + '--incremental', $backup2path . '/backup_manifest' ], + "incremental backup from node2"); + +# Restore the incremental backup and use it to create a new node. +my $node3 = PostgreSQL::Test::Cluster->new('node3'); +$node3->init_from_backup($node1, 'backup3', + combine_with_prior => [ 'backup1', 'backup2' ]); +$node3->start(); + +# Let's insert one more row. +$node3->safe_psql('postgres', <safe_psql('postgres', <command_ok( + [ 'pg_verifybackup', $node1->backup_dir . '/' . $backup_name ], + "verify backup $backup_name"); +} + +# OK, that's all. +done_testing(); diff --git a/src/bin/pg_combinebackup/t/004_manifest.pl b/src/bin/pg_combinebackup/t/004_manifest.pl new file mode 100644 index 0000000000000..37de61ac06155 --- /dev/null +++ b/src/bin/pg_combinebackup/t/004_manifest.pl @@ -0,0 +1,75 @@ +# Copyright (c) 2021-2023, PostgreSQL Global Development Group +# +# This test aims to validate that pg_combinebackup works in the degenerate +# case where it is invoked on a single full backup and that it can produce +# a new, valid manifest when it does. Secondarily, it checks that +# pg_combinebackup does not produce a manifest when run with --no-manifest. + +use strict; +use warnings; +use File::Compare; +use PostgreSQL::Test::Cluster; +use PostgreSQL::Test::Utils; +use Test::More; + +# Set up a new database instance. +my $node = PostgreSQL::Test::Cluster->new('node'); +$node->init(has_archiving => 1, allows_streaming => 1); +$node->start; + +# Take a full backup. +my $original_backup_path = $node->backup_dir . '/original'; +$node->command_ok( + [ 'pg_basebackup', '-D', $original_backup_path, '--no-sync', '-cfast' ], + "full backup"); + +# Verify the full backup. +$node->command_ok([ 'pg_verifybackup', $original_backup_path ], + "verify original backup"); + +# Process the backup with pg_combinebackup using various manifest options. +sub combine_and_test_one_backup +{ + my ($backup_name, $failure_pattern, @extra_options) = @_; + my $revised_backup_path = $node->backup_dir . '/' . $backup_name; + $node->command_ok( + [ 'pg_combinebackup', $original_backup_path, '-o', $revised_backup_path, + '--no-sync', @extra_options ], + "pg_combinebackup with @extra_options"); + if (defined $failure_pattern) + { + $node->command_fails_like( + [ 'pg_verifybackup', $revised_backup_path ], + $failure_pattern, + "unable to verify backup $backup_name"); + } + else + { + $node->command_ok( + [ 'pg_verifybackup', $revised_backup_path ], + "verify backup $backup_name"); + } +} +combine_and_test_one_backup('nomanifest', + qr/could not open file.*backup_manifest/, '--no-manifest'); +combine_and_test_one_backup('csum_none', + undef, '--manifest-checksums=NONE'); +combine_and_test_one_backup('csum_sha224', + undef, '--manifest-checksums=SHA224'); + +# Verify that SHA224 is mentioned in the SHA224 manifest lots of times. +my $sha224_manifest = + slurp_file($node->backup_dir . '/csum_sha224/backup_manifest'); +my $sha224_count = (() = $sha224_manifest =~ /SHA224/mig); +cmp_ok($sha224_count, + '>', 100, "SHA224 is mentioned many times in SHA224 manifest"); + +# Verify that SHA224 is mentioned in the SHA224 manifest lots of times. +my $nocsum_manifest = + slurp_file($node->backup_dir . '/csum_none/backup_manifest'); +my $nocsum_count = (() = $nocsum_manifest =~ /Checksum-Algorithm/mig); +is($nocsum_count, 0, + "Checksum_Algorithm is not mentioned in no-checksum manifest"); + +# OK, that's all. +done_testing(); diff --git a/src/bin/pg_combinebackup/t/005_integrity.pl b/src/bin/pg_combinebackup/t/005_integrity.pl new file mode 100644 index 0000000000000..b1f63a43e0716 --- /dev/null +++ b/src/bin/pg_combinebackup/t/005_integrity.pl @@ -0,0 +1,125 @@ +# Copyright (c) 2021-2023, PostgreSQL Global Development Group +# +# This test aims to validate that an incremental backup can be combined +# with a valid prior backup and that it cannot be combined with an invalid +# prior backup. + +use strict; +use warnings; +use File::Compare; +use File::Path qw(rmtree); +use PostgreSQL::Test::Cluster; +use PostgreSQL::Test::Utils; +use Test::More; + +# Set up a new database instance. +my $node1 = PostgreSQL::Test::Cluster->new('node1'); +$node1->init(has_archiving => 1, allows_streaming => 1); +$node1->append_conf('postgresql.conf', 'summarize_wal = on'); +$node1->start; + +# Set up another new database instance. We don't want to use the cached +# INITDB_TEMPLATE for this, because we want it to be a separate cluster +# with a different system ID. +my $node2; +{ + local $ENV{'INITDB_TEMPLATE'} = undef; + + $node2 = PostgreSQL::Test::Cluster->new('node2'); + $node2->init(has_archiving => 1, allows_streaming => 1); + $node2->append_conf('postgresql.conf', 'summarize_wal = on'); + $node2->start; +} + +# Take a full backup from node1. +my $backup1path = $node1->backup_dir . '/backup1'; +$node1->command_ok( + [ 'pg_basebackup', '-D', $backup1path, '--no-sync', '-cfast' ], + "full backup from node1"); + +# Now take an incremental backup. +my $backup2path = $node1->backup_dir . '/backup2'; +$node1->command_ok( + [ 'pg_basebackup', '-D', $backup2path, '--no-sync', '-cfast', + '--incremental', $backup1path . '/backup_manifest' ], + "incremental backup from node1"); + +# Now take another incremental backup. +my $backup3path = $node1->backup_dir . '/backup3'; +$node1->command_ok( + [ 'pg_basebackup', '-D', $backup3path, '--no-sync', '-cfast', + '--incremental', $backup2path . '/backup_manifest' ], + "another incremental backup from node1"); + +# Take a full backup from node2. +my $backupother1path = $node1->backup_dir . '/backupother1'; +$node2->command_ok( + [ 'pg_basebackup', '-D', $backupother1path, '--no-sync', '-cfast' ], + "full backup from node2"); + +# Take an incremental backup from node2. +my $backupother2path = $node1->backup_dir . '/backupother2'; +$node2->command_ok( + [ 'pg_basebackup', '-D', $backupother2path, '--no-sync', '-cfast', + '--incremental', $backupother1path . '/backup_manifest' ], + "incremental backup from node2"); + +# Result directory. +my $resultpath = $node1->backup_dir . '/result'; + +# Can't combine 2 full backups. +$node1->command_fails_like( + [ 'pg_combinebackup', $backup1path, $backup1path, '-o', $resultpath ], + qr/is a full backup, but only the first backup should be a full backup/, + "can't combine full backups"); + +# Can't combine 2 incremental backups. +$node1->command_fails_like( + [ 'pg_combinebackup', $backup2path, $backup2path, '-o', $resultpath ], + qr/is an incremental backup, but the first backup should be a full backup/, + "can't combine full backups"); + +# Can't combine full backup with an incremental backup from a different system. +$node1->command_fails_like( + [ 'pg_combinebackup', $backup1path, $backupother2path, '-o', $resultpath ], + qr/expected system identifier.*but found/, + "can't combine backups from different nodes"); + +# Can't omit a required backup. +$node1->command_fails_like( + [ 'pg_combinebackup', $backup1path, $backup3path, '-o', $resultpath ], + qr/starts at LSN.*but expected/, + "can't omit a required backup"); + +# Can't combine backups in the wrong order. +$node1->command_fails_like( + [ 'pg_combinebackup', $backup1path, $backup3path, $backup2path, '-o', $resultpath ], + qr/starts at LSN.*but expected/, + "can't combine backups in the wrong order"); + +# Can combine 3 backups that match up properly. +$node1->command_ok( + [ 'pg_combinebackup', $backup1path, $backup2path, $backup3path, '-o', $resultpath ], + "can combine 3 matching backups"); +rmtree($resultpath); + +# Can combine full backup with first incremental. +my $synthetic12path = $node1->backup_dir . '/synthetic12'; +$node1->command_ok( + [ 'pg_combinebackup', $backup1path, $backup2path, '-o', $synthetic12path ], + "can combine 2 matching backups"); + +# Can combine result of previous step with second incremental. +$node1->command_ok( + [ 'pg_combinebackup', $synthetic12path, $backup3path, '-o', $resultpath ], + "can combine synthetic backup with later incremental"); +rmtree($resultpath); + +# Can't combine result of 1+2 with 2. +$node1->command_fails_like( + [ 'pg_combinebackup', $synthetic12path, $backup2path, '-o', $resultpath ], + qr/starts at LSN.*but expected/, + "can't combine synthetic backup with included incremental"); + +# OK, that's all. +done_testing(); diff --git a/src/bin/pg_combinebackup/write_manifest.c b/src/bin/pg_combinebackup/write_manifest.c new file mode 100644 index 0000000000000..82160134d8bcd --- /dev/null +++ b/src/bin/pg_combinebackup/write_manifest.c @@ -0,0 +1,293 @@ +/*------------------------------------------------------------------------- + * + * Write a new backup manifest. + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/bin/pg_combinebackup/write_manifest.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres_fe.h" + +#include +#include +#include + +#include "common/checksum_helper.h" +#include "common/file_perm.h" +#include "common/logging.h" +#include "lib/stringinfo.h" +#include "load_manifest.h" +#include "mb/pg_wchar.h" +#include "write_manifest.h" + +struct manifest_writer +{ + char pathname[MAXPGPATH]; + int fd; + StringInfoData buf; + bool first_file; + bool still_checksumming; + pg_checksum_context manifest_ctx; +}; + +static void escape_json(StringInfo buf, const char *str); +static void flush_manifest(manifest_writer *mwriter); +static size_t hex_encode(const uint8 *src, size_t len, char *dst); + +/* + * Create a new backup manifest writer. + * + * The backup manifest will be written into a file named backup_manifest + * in the specified directory. + */ +manifest_writer * +create_manifest_writer(char *directory) +{ + manifest_writer *mwriter = pg_malloc(sizeof(manifest_writer)); + + snprintf(mwriter->pathname, MAXPGPATH, "%s/backup_manifest", directory); + mwriter->fd = -1; + initStringInfo(&mwriter->buf); + mwriter->first_file = true; + mwriter->still_checksumming = true; + pg_checksum_init(&mwriter->manifest_ctx, CHECKSUM_TYPE_SHA256); + + appendStringInfo(&mwriter->buf, + "{ \"PostgreSQL-Backup-Manifest-Version\": 1,\n" + "\"Files\": ["); + + return mwriter; +} + +/* + * Add an entry for a file to a backup manifest. + * + * This is very similar to the backend's AddFileToBackupManifest, but + * various adjustments are required due to frontend/backend differences + * and other details. + */ +void +add_file_to_manifest(manifest_writer *mwriter, const char *manifest_path, + size_t size, pg_time_t mtime, + pg_checksum_type checksum_type, + int checksum_length, + uint8 *checksum_payload) +{ + int pathlen = strlen(manifest_path); + + if (mwriter->first_file) + { + appendStringInfoChar(&mwriter->buf, '\n'); + mwriter->first_file = false; + } + else + appendStringInfoString(&mwriter->buf, ",\n"); + + if (pg_encoding_verifymbstr(PG_UTF8, manifest_path, pathlen) == pathlen) + { + appendStringInfoString(&mwriter->buf, "{ \"Path\": "); + escape_json(&mwriter->buf, manifest_path); + appendStringInfoString(&mwriter->buf, ", "); + } + else + { + appendStringInfoString(&mwriter->buf, "{ \"Encoded-Path\": \""); + enlargeStringInfo(&mwriter->buf, 2 * pathlen); + mwriter->buf.len += hex_encode((const uint8 *) manifest_path, pathlen, + &mwriter->buf.data[mwriter->buf.len]); + appendStringInfoString(&mwriter->buf, "\", "); + } + + appendStringInfo(&mwriter->buf, "\"Size\": %zu, ", size); + + appendStringInfoString(&mwriter->buf, "\"Last-Modified\": \""); + enlargeStringInfo(&mwriter->buf, 128); + mwriter->buf.len += strftime(&mwriter->buf.data[mwriter->buf.len], 128, + "%Y-%m-%d %H:%M:%S %Z", + gmtime(&mtime)); + appendStringInfoChar(&mwriter->buf, '"'); + + if (mwriter->buf.len > 128 * 1024) + flush_manifest(mwriter); + + if (checksum_length > 0) + { + appendStringInfo(&mwriter->buf, + ", \"Checksum-Algorithm\": \"%s\", \"Checksum\": \"", + pg_checksum_type_name(checksum_type)); + + enlargeStringInfo(&mwriter->buf, 2 * checksum_length); + mwriter->buf.len += hex_encode(checksum_payload, checksum_length, + &mwriter->buf.data[mwriter->buf.len]); + + appendStringInfoChar(&mwriter->buf, '"'); + } + + appendStringInfoString(&mwriter->buf, " }"); + + if (mwriter->buf.len > 128 * 1024) + flush_manifest(mwriter); +} + +/* + * Finalize the backup_manifest. + */ +void +finalize_manifest(manifest_writer *mwriter, + manifest_wal_range *first_wal_range) +{ + uint8 checksumbuf[PG_SHA256_DIGEST_LENGTH]; + int len; + manifest_wal_range *wal_range; + + /* Terminate the list of files. */ + appendStringInfoString(&mwriter->buf, "\n],\n"); + + /* Start a list of LSN ranges. */ + appendStringInfoString(&mwriter->buf, "\"WAL-Ranges\": [\n"); + + for (wal_range = first_wal_range; wal_range != NULL; + wal_range = wal_range->next) + appendStringInfo(&mwriter->buf, + "%s{ \"Timeline\": %u, \"Start-LSN\": \"%X/%X\", \"End-LSN\": \"%X/%X\" }", + wal_range == first_wal_range ? "" : ",\n", + wal_range->tli, + LSN_FORMAT_ARGS(wal_range->start_lsn), + LSN_FORMAT_ARGS(wal_range->end_lsn)); + + /* Terminate the list of WAL ranges. */ + appendStringInfoString(&mwriter->buf, "\n],\n"); + + /* Flush accumulated data and update checksum calculation. */ + flush_manifest(mwriter); + + /* Checksum only includes data up to this point. */ + mwriter->still_checksumming = false; + + /* Compute and insert manifest checksum. */ + appendStringInfoString(&mwriter->buf, "\"Manifest-Checksum\": \""); + enlargeStringInfo(&mwriter->buf, 2 * PG_SHA256_DIGEST_STRING_LENGTH); + len = pg_checksum_final(&mwriter->manifest_ctx, checksumbuf); + Assert(len == PG_SHA256_DIGEST_LENGTH); + mwriter->buf.len += + hex_encode(checksumbuf, len, &mwriter->buf.data[mwriter->buf.len]); + appendStringInfoString(&mwriter->buf, "\"}\n"); + + /* Flush the last manifest checksum itself. */ + flush_manifest(mwriter); + + /* Close the file. */ + if (close(mwriter->fd) != 0) + pg_fatal("could not close \"%s\": %m", mwriter->pathname); + mwriter->fd = -1; +} + +/* + * Produce a JSON string literal, properly escaping characters in the text. + */ +static void +escape_json(StringInfo buf, const char *str) +{ + const char *p; + + appendStringInfoCharMacro(buf, '"'); + for (p = str; *p; p++) + { + switch (*p) + { + case '\b': + appendStringInfoString(buf, "\\b"); + break; + case '\f': + appendStringInfoString(buf, "\\f"); + break; + case '\n': + appendStringInfoString(buf, "\\n"); + break; + case '\r': + appendStringInfoString(buf, "\\r"); + break; + case '\t': + appendStringInfoString(buf, "\\t"); + break; + case '"': + appendStringInfoString(buf, "\\\""); + break; + case '\\': + appendStringInfoString(buf, "\\\\"); + break; + default: + if ((unsigned char) *p < ' ') + appendStringInfo(buf, "\\u%04x", (int) *p); + else + appendStringInfoCharMacro(buf, *p); + break; + } + } + appendStringInfoCharMacro(buf, '"'); +} + +/* + * Flush whatever portion of the backup manifest we have generated and + * buffered in memory out to a file on disk. + * + * The first call to this function will create the file. After that, we + * keep it open and just append more data. + */ +static void +flush_manifest(manifest_writer *mwriter) +{ + char pathname[MAXPGPATH]; + + if (mwriter->fd == -1 && + (mwriter->fd = open(mwriter->pathname, + O_WRONLY | O_CREAT | O_EXCL | PG_BINARY, + pg_file_create_mode)) < 0) + pg_fatal("could not open file \"%s\": %m", mwriter->pathname); + + if (mwriter->buf.len > 0) + { + ssize_t wb; + + wb = write(mwriter->fd, mwriter->buf.data, mwriter->buf.len); + if (wb != mwriter->buf.len) + { + if (wb < 0) + pg_fatal("could not write \"%s\": %m", mwriter->pathname); + else + pg_fatal("could not write file \"%s\": wrote only %d of %d bytes", + pathname, (int) wb, mwriter->buf.len); + } + + if (mwriter->still_checksumming) + pg_checksum_update(&mwriter->manifest_ctx, + (uint8 *) mwriter->buf.data, + mwriter->buf.len); + resetStringInfo(&mwriter->buf); + } +} + +/* + * Encode bytes using two hexademical digits for each one. + */ +static size_t +hex_encode(const uint8 *src, size_t len, char *dst) +{ + const uint8 *end = src + len; + + while (src < end) + { + unsigned n1 = (*src >> 4) & 0xF; + unsigned n2 = *src & 0xF; + + *dst++ = n1 < 10 ? '0' + n1 : 'a' + n1 - 10; + *dst++ = n2 < 10 ? '0' + n2 : 'a' + n2 - 10; + ++src; + } + + return len * 2; +} diff --git a/src/bin/pg_combinebackup/write_manifest.h b/src/bin/pg_combinebackup/write_manifest.h new file mode 100644 index 0000000000000..8fd7fe02c87b0 --- /dev/null +++ b/src/bin/pg_combinebackup/write_manifest.h @@ -0,0 +1,33 @@ +/*------------------------------------------------------------------------- + * + * Write a new backup manifest. + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/bin/pg_combinebackup/write_manifest.h + * + *------------------------------------------------------------------------- + */ +#ifndef WRITE_MANIFEST_H +#define WRITE_MANIFEST_H + +#include "common/checksum_helper.h" +#include "pgtime.h" + +struct manifest_wal_range; + +struct manifest_writer; +typedef struct manifest_writer manifest_writer; + +extern manifest_writer *create_manifest_writer(char *directory); +extern void add_file_to_manifest(manifest_writer *mwriter, + const char *manifest_path, + size_t size, pg_time_t mtime, + pg_checksum_type checksum_type, + int checksum_length, + uint8 *checksum_payload); +extern void finalize_manifest(manifest_writer *mwriter, + struct manifest_wal_range *first_wal_range); + +#endif /* WRITE_MANIFEST_H */ diff --git a/src/bin/pg_resetwal/pg_resetwal.c b/src/bin/pg_resetwal/pg_resetwal.c index 3ae3fc06df20d..5407f51a4e74e 100644 --- a/src/bin/pg_resetwal/pg_resetwal.c +++ b/src/bin/pg_resetwal/pg_resetwal.c @@ -85,6 +85,7 @@ static void RewriteControlFile(void); static void FindEndOfXLOG(void); static void KillExistingXLOG(void); static void KillExistingArchiveStatus(void); +static void KillExistingWALSummaries(void); static void WriteEmptyXLOG(void); static void usage(void); @@ -493,6 +494,7 @@ main(int argc, char *argv[]) RewriteControlFile(); KillExistingXLOG(); KillExistingArchiveStatus(); + KillExistingWALSummaries(); WriteEmptyXLOG(); printf(_("Write-ahead log reset\n")); @@ -1034,6 +1036,40 @@ KillExistingArchiveStatus(void) pg_fatal("could not close directory \"%s\": %m", ARCHSTATDIR); } +/* + * Remove existing WAL summary files + */ +static void +KillExistingWALSummaries(void) +{ +#define WALSUMMARYDIR XLOGDIR "/summaries" +#define WALSUMMARY_NHEXCHARS 40 + + DIR *xldir; + struct dirent *xlde; + char path[MAXPGPATH + sizeof(WALSUMMARYDIR)]; + + xldir = opendir(WALSUMMARYDIR); + if (xldir == NULL) + pg_fatal("could not open directory \"%s\": %m", WALSUMMARYDIR); + + while (errno = 0, (xlde = readdir(xldir)) != NULL) + { + if (strspn(xlde->d_name, "0123456789ABCDEF") == WALSUMMARY_NHEXCHARS && + strcmp(xlde->d_name + WALSUMMARY_NHEXCHARS, ".summary") == 0) + { + snprintf(path, sizeof(path), "%s/%s", WALSUMMARYDIR, xlde->d_name); + if (unlink(path) < 0) + pg_fatal("could not delete file \"%s\": %m", path); + } + } + + if (errno) + pg_fatal("could not read directory \"%s\": %m", WALSUMMARYDIR); + + if (closedir(xldir)) + pg_fatal("could not close directory \"%s\": %m", ARCHSTATDIR); +} /* * Write an empty XLOG file, containing only the checkpoint record diff --git a/src/include/access/xlogbackup.h b/src/include/access/xlogbackup.h index 1611358137b8d..90e04cad56993 100644 --- a/src/include/access/xlogbackup.h +++ b/src/include/access/xlogbackup.h @@ -28,6 +28,8 @@ typedef struct BackupState XLogRecPtr checkpointloc; /* last checkpoint location */ pg_time_t starttime; /* backup start time */ bool started_in_recovery; /* backup started in recovery? */ + XLogRecPtr istartpoint; /* incremental based on backup at this LSN */ + TimeLineID istarttli; /* incremental based on backup on this TLI */ /* Fields saved at the end of backup */ XLogRecPtr stoppoint; /* backup stop WAL location */ diff --git a/src/include/backup/basebackup.h b/src/include/backup/basebackup.h index 1432d9c206bbc..345bd22534c11 100644 --- a/src/include/backup/basebackup.h +++ b/src/include/backup/basebackup.h @@ -34,6 +34,9 @@ typedef struct int64 size; /* total size as sent; -1 if not known */ } tablespaceinfo; -extern void SendBaseBackup(BaseBackupCmd *cmd); +struct IncrementalBackupInfo; + +extern void SendBaseBackup(BaseBackupCmd *cmd, + struct IncrementalBackupInfo *ib); #endif /* _BASEBACKUP_H */ diff --git a/src/include/backup/basebackup_incremental.h b/src/include/backup/basebackup_incremental.h new file mode 100644 index 0000000000000..de99117599e77 --- /dev/null +++ b/src/include/backup/basebackup_incremental.h @@ -0,0 +1,55 @@ +/*------------------------------------------------------------------------- + * + * basebackup_incremental.h + * API for incremental backup support + * + * Portions Copyright (c) 2010-2022, PostgreSQL Global Development Group + * + * src/include/backup/basebackup_incremental.h + * + *------------------------------------------------------------------------- + */ +#ifndef BASEBACKUP_INCREMENTAL_H +#define BASEBACKUP_INCREMENTAL_H + +#include "access/xlogbackup.h" +#include "common/relpath.h" +#include "storage/block.h" +#include "utils/palloc.h" + +#define INCREMENTAL_MAGIC 0xd3ae1f0d + +typedef enum +{ + BACK_UP_FILE_FULLY, + BACK_UP_FILE_INCREMENTALLY +} FileBackupMethod; + +struct IncrementalBackupInfo; +typedef struct IncrementalBackupInfo IncrementalBackupInfo; + +extern IncrementalBackupInfo *CreateIncrementalBackupInfo(MemoryContext); + +extern void AppendIncrementalManifestData(IncrementalBackupInfo *ib, + const char *data, + int len); +extern void FinalizeIncrementalManifest(IncrementalBackupInfo *ib); + +extern void PrepareForIncrementalBackup(IncrementalBackupInfo *ib, + BackupState *backup_state); + +extern char *GetIncrementalFilePath(Oid dboid, Oid spcoid, + RelFileNumber relfilenumber, + ForkNumber forknum, unsigned segno); +extern FileBackupMethod GetFileBackupMethod(IncrementalBackupInfo *ib, + const char *path, + Oid dboid, Oid spcoid, + RelFileNumber relfilenumber, + ForkNumber forknum, + unsigned segno, size_t size, + unsigned *num_blocks_required, + BlockNumber *relative_block_numbers, + unsigned *truncation_block_length); +extern size_t GetIncrementalFileSize(unsigned num_blocks_required); + +#endif diff --git a/src/include/nodes/replnodes.h b/src/include/nodes/replnodes.h index 5142a08729173..c98961c329fcb 100644 --- a/src/include/nodes/replnodes.h +++ b/src/include/nodes/replnodes.h @@ -108,4 +108,13 @@ typedef struct TimeLineHistoryCmd TimeLineID timeline; } TimeLineHistoryCmd; +/* ---------------------- + * UPLOAD_MANIFEST command + * ---------------------- + */ +typedef struct UploadManifestCmd +{ + NodeTag type; +} UploadManifestCmd; + #endif /* REPLNODES_H */ diff --git a/src/test/perl/PostgreSQL/Test/Cluster.pm b/src/test/perl/PostgreSQL/Test/Cluster.pm index a020377761dff..46cb2a65500e4 100644 --- a/src/test/perl/PostgreSQL/Test/Cluster.pm +++ b/src/test/perl/PostgreSQL/Test/Cluster.pm @@ -779,6 +779,10 @@ a tar-format backup, pass the name of the tar program to use in the keyword parameter tar_program. Note that tablespace tar files aren't handled here. +To restore from an incremental backup, pass the parameter combine_with_prior +as a reference to an array of prior backup names with which this backup +is to be combined using pg_combinebackup. + Streaming replication can be enabled on this node by passing the keyword parameter has_streaming => 1. This is disabled by default. @@ -816,7 +820,22 @@ sub init_from_backup mkdir $self->archive_dir; my $data_path = $self->data_dir; - if (defined $params{tar_program}) + if (defined $params{combine_with_prior}) + { + my @prior_backups = @{$params{combine_with_prior}}; + my @prior_backup_path; + + for my $prior_backup_name (@prior_backups) + { + push @prior_backup_path, + $root_node->backup_dir . '/' . $prior_backup_name; + } + + local %ENV = $self->_get_env(); + PostgreSQL::Test::Utils::system_or_bail('pg_combinebackup', '-d', + @prior_backup_path, $backup_path, '-o', $data_path); + } + elsif (defined $params{tar_program}) { mkdir($data_path); PostgreSQL::Test::Utils::system_or_bail($params{tar_program}, 'xf', diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index 9390049314250..e37ef9aa76da0 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -4023,3 +4023,15 @@ SummarizerReadLocalXLogPrivate WalSummarizerData WalSummaryFile WalSummaryIO +FileBackupMethod +IncrementalBackupInfo +UploadManifestCmd +backup_file_entry +backup_wal_range +cb_cleanup_dir +cb_options +cb_tablespace +cb_tablespace_mapping +manifest_data +manifest_writer +rfile