Permalink
Find file Copy path
779 lines (661 sloc) 20.5 KB
/*-------------------------------------------------------------------------
*
* xlogfuncs.c
*
* PostgreSQL write-ahead log manager user interface functions
*
* This file contains WAL control and information functions.
*
*
* Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* src/backend/access/transam/xlogfuncs.c
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include <unistd.h>
#include "access/htup_details.h"
#include "access/xlog.h"
#include "access/xlog_internal.h"
#include "access/xlogutils.h"
#include "catalog/pg_type.h"
#include "funcapi.h"
#include "miscadmin.h"
#include "pgstat.h"
#include "replication/walreceiver.h"
#include "storage/smgr.h"
#include "utils/builtins.h"
#include "utils/memutils.h"
#include "utils/numeric.h"
#include "utils/guc.h"
#include "utils/pg_lsn.h"
#include "utils/timestamp.h"
#include "utils/tuplestore.h"
#include "storage/fd.h"
#include "storage/ipc.h"
/*
* Store label file and tablespace map during non-exclusive backups.
*/
static StringInfo label_file;
static StringInfo tblspc_map_file;
/*
* Called when the backend exits with a running non-exclusive base backup,
* to clean up state.
*/
static void
nonexclusive_base_backup_cleanup(int code, Datum arg)
{
do_pg_abort_backup();
ereport(WARNING,
(errmsg("aborting backup due to backend exiting before pg_stop_backup was called")));
}
/*
* pg_start_backup: set up for taking an on-line backup dump
*
* Essentially what this does is to create a backup label file in $PGDATA,
* where it will be archived as part of the backup dump. The label file
* contains the user-supplied label string (typically this would be used
* to tell where the backup dump will be stored) and the starting time and
* starting WAL location for the dump.
*
* Permission checking for this function is managed through the normal
* GRANT system.
*/
Datum
pg_start_backup(PG_FUNCTION_ARGS)
{
text *backupid = PG_GETARG_TEXT_PP(0);
bool fast = PG_GETARG_BOOL(1);
bool exclusive = PG_GETARG_BOOL(2);
char *backupidstr;
XLogRecPtr startpoint;
SessionBackupState status = get_backup_status();
backupidstr = text_to_cstring(backupid);
if (status == SESSION_BACKUP_NON_EXCLUSIVE)
ereport(ERROR,
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
errmsg("a backup is already in progress in this session")));
if (exclusive)
{
startpoint = do_pg_start_backup(backupidstr, fast, NULL, NULL,
NULL, NULL, false, true);
}
else
{
MemoryContext oldcontext;
/*
* Label file and tablespace map file need to be long-lived, since
* they are read in pg_stop_backup.
*/
oldcontext = MemoryContextSwitchTo(TopMemoryContext);
label_file = makeStringInfo();
tblspc_map_file = makeStringInfo();
MemoryContextSwitchTo(oldcontext);
startpoint = do_pg_start_backup(backupidstr, fast, NULL, label_file,
NULL, tblspc_map_file, false, true);
before_shmem_exit(nonexclusive_base_backup_cleanup, (Datum) 0);
}
PG_RETURN_LSN(startpoint);
}
/*
* pg_stop_backup: finish taking an on-line backup dump
*
* We write an end-of-backup WAL record, and remove the backup label file
* created by pg_start_backup, creating a backup history file in pg_wal
* instead (whence it will immediately be archived). The backup history file
* contains the same info found in the label file, plus the backup-end time
* and WAL location. Before 9.0, the backup-end time was read from the backup
* history file at the beginning of archive recovery, but we now use the WAL
* record for that and the file is for informational and debug purposes only.
*
* Note: different from CancelBackup which just cancels online backup mode.
*
* Note: this version is only called to stop an exclusive backup. The function
* pg_stop_backup_v2 (overloaded as pg_stop_backup in SQL) is called to
* stop non-exclusive backups.
*
* Permission checking for this function is managed through the normal
* GRANT system.
*/
Datum
pg_stop_backup(PG_FUNCTION_ARGS)
{
XLogRecPtr stoppoint;
SessionBackupState status = get_backup_status();
if (status == SESSION_BACKUP_NON_EXCLUSIVE)
ereport(ERROR,
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
errmsg("non-exclusive backup in progress"),
errhint("Did you mean to use pg_stop_backup('f')?")));
/*
* Exclusive backups were typically started in a different connection, so
* don't try to verify that status of backup is set to
* SESSION_BACKUP_EXCLUSIVE in this function. Actual verification that an
* exclusive backup is in fact running is handled inside
* do_pg_stop_backup.
*/
stoppoint = do_pg_stop_backup(NULL, true, NULL);
PG_RETURN_LSN(stoppoint);
}
/*
* pg_stop_backup_v2: finish taking exclusive or nonexclusive on-line backup.
*
* Works the same as pg_stop_backup, except for non-exclusive backups it returns
* the backup label and tablespace map files as text fields in as part of the
* resultset.
*
* The first parameter (variable 'exclusive') allows the user to tell us if
* this is an exclusive or a non-exclusive backup.
*
* The second parameter (variable 'waitforarchive'), which is optional,
* allows the user to choose if they want to wait for the WAL to be archived
* or if we should just return as soon as the WAL record is written.
*
* Permission checking for this function is managed through the normal
* GRANT system.
*/
Datum
pg_stop_backup_v2(PG_FUNCTION_ARGS)
{
ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
TupleDesc tupdesc;
Tuplestorestate *tupstore;
MemoryContext per_query_ctx;
MemoryContext oldcontext;
Datum values[3];
bool nulls[3];
bool exclusive = PG_GETARG_BOOL(0);
bool waitforarchive = PG_GETARG_BOOL(1);
XLogRecPtr stoppoint;
SessionBackupState status = get_backup_status();
/* check to see if caller supports us returning a tuplestore */
if (rsinfo == NULL || !IsA(rsinfo, ReturnSetInfo))
ereport(ERROR,
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("set-valued function called in context that cannot accept a set")));
if (!(rsinfo->allowedModes & SFRM_Materialize))
ereport(ERROR,
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("materialize mode required, but it is not " \
"allowed in this context")));
/* Build a tuple descriptor for our result type */
if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE)
elog(ERROR, "return type must be a row type");
per_query_ctx = rsinfo->econtext->ecxt_per_query_memory;
oldcontext = MemoryContextSwitchTo(per_query_ctx);
tupstore = tuplestore_begin_heap(true, false, work_mem);
rsinfo->returnMode = SFRM_Materialize;
rsinfo->setResult = tupstore;
rsinfo->setDesc = tupdesc;
MemoryContextSwitchTo(oldcontext);
MemSet(values, 0, sizeof(values));
MemSet(nulls, 0, sizeof(nulls));
if (exclusive)
{
if (status == SESSION_BACKUP_NON_EXCLUSIVE)
ereport(ERROR,
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
errmsg("non-exclusive backup in progress"),
errhint("Did you mean to use pg_stop_backup('f')?")));
/*
* Stop the exclusive backup, and since we're in an exclusive backup
* return NULL for both backup_label and tablespace_map.
*/
stoppoint = do_pg_stop_backup(NULL, waitforarchive, NULL);
nulls[1] = true;
nulls[2] = true;
}
else
{
if (status != SESSION_BACKUP_NON_EXCLUSIVE)
ereport(ERROR,
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
errmsg("non-exclusive backup is not in progress"),
errhint("Did you mean to use pg_stop_backup('t')?")));
/*
* Stop the non-exclusive backup. Return a copy of the backup label
* and tablespace map so they can be written to disk by the caller.
*/
stoppoint = do_pg_stop_backup(label_file->data, waitforarchive, NULL);
cancel_before_shmem_exit(nonexclusive_base_backup_cleanup, (Datum) 0);
values[1] = CStringGetTextDatum(label_file->data);
values[2] = CStringGetTextDatum(tblspc_map_file->data);
/* Free structures allocated in TopMemoryContext */
pfree(label_file->data);
pfree(label_file);
label_file = NULL;
pfree(tblspc_map_file->data);
pfree(tblspc_map_file);
tblspc_map_file = NULL;
}
/* Stoppoint is included on both exclusive and nonexclusive backups */
values[0] = LSNGetDatum(stoppoint);
tuplestore_putvalues(tupstore, tupdesc, values, nulls);
tuplestore_donestoring(typstore);
return (Datum) 0;
}
/*
* pg_switch_wal: switch to next xlog file
*
* Permission checking for this function is managed through the normal
* GRANT system.
*/
Datum
pg_switch_wal(PG_FUNCTION_ARGS)
{
XLogRecPtr switchpoint;
if (RecoveryInProgress())
ereport(ERROR,
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
errmsg("recovery is in progress"),
errhint("WAL control functions cannot be executed during recovery.")));
switchpoint = RequestXLogSwitch(false);
/*
* As a convenience, return the WAL location of the switch record
*/
PG_RETURN_LSN(switchpoint);
}
/*
* pg_create_restore_point: a named point for restore
*
* Permission checking for this function is managed through the normal
* GRANT system.
*/
Datum
pg_create_restore_point(PG_FUNCTION_ARGS)
{
text *restore_name = PG_GETARG_TEXT_PP(0);
char *restore_name_str;
XLogRecPtr restorepoint;
if (RecoveryInProgress())
ereport(ERROR,
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
(errmsg("recovery is in progress"),
errhint("WAL control functions cannot be executed during recovery."))));
if (!XLogIsNeeded())
ereport(ERROR,
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
errmsg("WAL level not sufficient for creating a restore point"),
errhint("wal_level must be set to \"replica\" or \"logical\" at server start.")));
restore_name_str = text_to_cstring(restore_name);
if (strlen(restore_name_str) >= MAXFNAMELEN)
ereport(ERROR,
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("value too long for restore point (maximum %d characters)", MAXFNAMELEN - 1)));
restorepoint = XLogRestorePoint(restore_name_str);
/*
* As a convenience, return the WAL location of the restore point record
*/
PG_RETURN_LSN(restorepoint);
}
/*
* Report the current WAL write location (same format as pg_start_backup etc)
*
* This is useful for determining how much of WAL is visible to an external
* archiving process. Note that the data before this point is written out
* to the kernel, but is not necessarily synced to disk.
*/
Datum
pg_current_wal_lsn(PG_FUNCTION_ARGS)
{
XLogRecPtr current_recptr;
if (RecoveryInProgress())
ereport(ERROR,
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
errmsg("recovery is in progress"),
errhint("WAL control functions cannot be executed during recovery.")));
current_recptr = GetXLogWriteRecPtr();
PG_RETURN_LSN(current_recptr);
}
/*
* Report the current WAL insert location (same format as pg_start_backup etc)
*
* This function is mostly for debugging purposes.
*/
Datum
pg_current_wal_insert_lsn(PG_FUNCTION_ARGS)
{
XLogRecPtr current_recptr;
if (RecoveryInProgress())
ereport(ERROR,
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
errmsg("recovery is in progress"),
errhint("WAL control functions cannot be executed during recovery.")));
current_recptr = GetXLogInsertRecPtr();
PG_RETURN_LSN(current_recptr);
}
/*
* Report the current WAL flush location (same format as pg_start_backup etc)
*
* This function is mostly for debugging purposes.
*/
Datum
pg_current_wal_flush_lsn(PG_FUNCTION_ARGS)
{
XLogRecPtr current_recptr;
if (RecoveryInProgress())
ereport(ERROR,
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
errmsg("recovery is in progress"),
errhint("WAL control functions cannot be executed during recovery.")));
current_recptr = GetFlushRecPtr();
PG_RETURN_LSN(current_recptr);
}
/*
* Report the last WAL receive location (same format as pg_start_backup etc)
*
* This is useful for determining how much of WAL is guaranteed to be received
* and synced to disk by walreceiver.
*/
Datum
pg_last_wal_receive_lsn(PG_FUNCTION_ARGS)
{
XLogRecPtr recptr;
recptr = GetWalRcvWriteRecPtr(NULL, NULL);
if (recptr == 0)
PG_RETURN_NULL();
PG_RETURN_LSN(recptr);
}
/*
* Report the last WAL replay location (same format as pg_start_backup etc)
*
* This is useful for determining how much of WAL is visible to read-only
* connections during recovery.
*/
Datum
pg_last_wal_replay_lsn(PG_FUNCTION_ARGS)
{
XLogRecPtr recptr;
recptr = GetXLogReplayRecPtr(NULL);
if (recptr == 0)
PG_RETURN_NULL();
PG_RETURN_LSN(recptr);
}
/*
* Compute an xlog file name and decimal byte offset given a WAL location,
* such as is returned by pg_stop_backup() or pg_switch_wal().
*
* Note that a location exactly at a segment boundary is taken to be in
* the previous segment. This is usually the right thing, since the
* expected usage is to determine which xlog file(s) are ready to archive.
*/
Datum
pg_walfile_name_offset(PG_FUNCTION_ARGS)
{
XLogSegNo xlogsegno;
uint32 xrecoff;
XLogRecPtr locationpoint = PG_GETARG_LSN(0);
char xlogfilename[MAXFNAMELEN];
Datum values[2];
bool isnull[2];
TupleDesc resultTupleDesc;
HeapTuple resultHeapTuple;
Datum result;
if (RecoveryInProgress())
ereport(ERROR,
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
errmsg("recovery is in progress"),
errhint("%s cannot be executed during recovery.",
"pg_walfile_name_offset()")));
/*
* Construct a tuple descriptor for the result row. This must match this
* function's pg_proc entry!
*/
resultTupleDesc = CreateTemplateTupleDesc(2);
TupleDescInitEntry(resultTupleDesc, (AttrNumber) 1, "file_name",
TEXTOID, -1, 0);
TupleDescInitEntry(resultTupleDesc, (AttrNumber) 2, "file_offset",
INT4OID, -1, 0);
resultTupleDesc = BlessTupleDesc(resultTupleDesc);
/*
* xlogfilename
*/
XLByteToPrevSeg(locationpoint, xlogsegno, wal_segment_size);
XLogFileName(xlogfilename, ThisTimeLineID, xlogsegno, wal_segment_size);
values[0] = CStringGetTextDatum(xlogfilename);
isnull[0] = false;
/*
* offset
*/
xrecoff = XLogSegmentOffset(locationpoint, wal_segment_size);
values[1] = UInt32GetDatum(xrecoff);
isnull[1] = false;
/*
* Tuple jam: Having first prepared your Datums, then squash together
*/
resultHeapTuple = heap_form_tuple(resultTupleDesc, values, isnull);
result = HeapTupleGetDatum(resultHeapTuple);
PG_RETURN_DATUM(result);
}
/*
* Compute an xlog file name given a WAL location,
* such as is returned by pg_stop_backup() or pg_switch_wal().
*/
Datum
pg_walfile_name(PG_FUNCTION_ARGS)
{
XLogSegNo xlogsegno;
XLogRecPtr locationpoint = PG_GETARG_LSN(0);
char xlogfilename[MAXFNAMELEN];
if (RecoveryInProgress())
ereport(ERROR,
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
errmsg("recovery is in progress"),
errhint("%s cannot be executed during recovery.",
"pg_walfile_name()")));
XLByteToPrevSeg(locationpoint, xlogsegno, wal_segment_size);
XLogFileName(xlogfilename, ThisTimeLineID, xlogsegno, wal_segment_size);
PG_RETURN_TEXT_P(cstring_to_text(xlogfilename));
}
/*
* pg_wal_replay_pause - pause recovery now
*
* Permission checking for this function is managed through the normal
* GRANT system.
*/
Datum
pg_wal_replay_pause(PG_FUNCTION_ARGS)
{
if (!RecoveryInProgress())
ereport(ERROR,
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
errmsg("recovery is not in progress"),
errhint("Recovery control functions can only be executed during recovery.")));
SetRecoveryPause(true);
PG_RETURN_VOID();
}
/*
* pg_wal_replay_resume - resume recovery now
*
* Permission checking for this function is managed through the normal
* GRANT system.
*/
Datum
pg_wal_replay_resume(PG_FUNCTION_ARGS)
{
if (!RecoveryInProgress())
ereport(ERROR,
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
errmsg("recovery is not in progress"),
errhint("Recovery control functions can only be executed during recovery.")));
SetRecoveryPause(false);
PG_RETURN_VOID();
}
/*
* pg_is_wal_replay_paused
*/
Datum
pg_is_wal_replay_paused(PG_FUNCTION_ARGS)
{
if (!RecoveryInProgress())
ereport(ERROR,
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
errmsg("recovery is not in progress"),
errhint("Recovery control functions can only be executed during recovery.")));
PG_RETURN_BOOL(RecoveryIsPaused());
}
/*
* Returns timestamp of latest processed commit/abort record.
*
* When the server has been started normally without recovery the function
* returns NULL.
*/
Datum
pg_last_xact_replay_timestamp(PG_FUNCTION_ARGS)
{
TimestampTz xtime;
xtime = GetLatestXTime();
if (xtime == 0)
PG_RETURN_NULL();
PG_RETURN_TIMESTAMPTZ(xtime);
}
/*
* Returns bool with current recovery mode, a global state.
*/
Datum
pg_is_in_recovery(PG_FUNCTION_ARGS)
{
PG_RETURN_BOOL(RecoveryInProgress());
}
/*
* Compute the difference in bytes between two WAL locations.
*/
Datum
pg_wal_lsn_diff(PG_FUNCTION_ARGS)
{
Datum result;
result = DirectFunctionCall2(pg_lsn_mi,
PG_GETARG_DATUM(0),
PG_GETARG_DATUM(1));
PG_RETURN_NUMERIC(result);
}
/*
* Returns bool with current on-line backup mode, a global state.
*/
Datum
pg_is_in_backup(PG_FUNCTION_ARGS)
{
PG_RETURN_BOOL(BackupInProgress());
}
/*
* Returns start time of an online exclusive backup.
*
* When there's no exclusive backup in progress, the function
* returns NULL.
*/
Datum
pg_backup_start_time(PG_FUNCTION_ARGS)
{
Datum xtime;
FILE *lfp;
char fline[MAXPGPATH];
char backup_start_time[30];
/*
* See if label file is present
*/
lfp = AllocateFile(BACKUP_LABEL_FILE, "r");
if (lfp == NULL)
{
if (errno != ENOENT)
ereport(ERROR,
(errcode_for_file_access(),
errmsg("could not read file \"%s\": %m",
BACKUP_LABEL_FILE)));
PG_RETURN_NULL();
}
/*
* Parse the file to find the START TIME line.
*/
backup_start_time[0] = '\0';
while (fgets(fline, sizeof(fline), lfp) != NULL)
{
if (sscanf(fline, "START TIME: %25[^\n]\n", backup_start_time) == 1)
break;
}
/* Check for a read error. */
if (ferror(lfp))
ereport(ERROR,
(errcode_for_file_access(),
errmsg("could not read file \"%s\": %m", BACKUP_LABEL_FILE)));
/* Close the backup label file. */
if (FreeFile(lfp))
ereport(ERROR,
(errcode_for_file_access(),
errmsg("could not close file \"%s\": %m", BACKUP_LABEL_FILE)));
if (strlen(backup_start_time) == 0)
ereport(ERROR,
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
/*
* Convert the time string read from file to TimestampTz form.
*/
xtime = DirectFunctionCall3(timestamptz_in,
CStringGetDatum(backup_start_time),
ObjectIdGetDatum(InvalidOid),
Int32GetDatum(-1));
PG_RETURN_DATUM(xtime);
}
/*
* Promotes a standby server.
*
* A result of "true" means that promotion has been completed if "wait" is
* "true", or initiated if "wait" is false.
*/
Datum
pg_promote(PG_FUNCTION_ARGS)
{
bool wait = PG_GETARG_BOOL(0);
int wait_seconds = PG_GETARG_INT32(1);
FILE *promote_file;
int i;
if (!RecoveryInProgress())
ereport(ERROR,
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
errmsg("recovery is not in progress"),
errhint("Recovery control functions can only be executed during recovery.")));
if (wait_seconds <= 0)
ereport(ERROR,
(errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
errmsg("\"wait_seconds\" cannot be negative or equal zero")));
/* create the promote signal file */
promote_file = AllocateFile(PROMOTE_SIGNAL_FILE, "w");
if (!promote_file)
ereport(ERROR,
(errcode_for_file_access(),
errmsg("could not create file \"%s\": %m",
PROMOTE_SIGNAL_FILE)));
if (FreeFile(promote_file))
ereport(ERROR,
(errcode_for_file_access(),
errmsg("could not write file \"%s\": %m",
PROMOTE_SIGNAL_FILE)));
/* signal the postmaster */
if (kill(PostmasterPid, SIGUSR1) != 0)
{
ereport(WARNING,
(errmsg("failed to send signal to postmaster: %m")));
(void) unlink(PROMOTE_SIGNAL_FILE);
PG_RETURN_BOOL(false);
}
/* return immediately if waiting was not requested */
if (!wait)
PG_RETURN_BOOL(true);
/* wait for the amount of time wanted until promotion */
#define WAITS_PER_SECOND 10
for (i = 0; i < WAITS_PER_SECOND * wait_seconds; i++)
{
ResetLatch(MyLatch);
if (!RecoveryInProgress())
PG_RETURN_BOOL(true);
CHECK_FOR_INTERRUPTS();
(void) WaitLatch(MyLatch,
WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH,
1000L / WAITS_PER_SECOND,
WAIT_EVENT_PROMOTE);
}
ereport(WARNING,
(errmsg("server did not promote within %d seconds", wait_seconds)));
PG_RETURN_BOOL(false);
}