Skip to content

Commit 1584caa

Browse files
committed
Merge branch 'release_2_4' into issue_66
2 parents 45995f0 + fd36a51 commit 1584caa

File tree

4 files changed

+174
-61
lines changed

4 files changed

+174
-61
lines changed

src/backup.c

Lines changed: 78 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -412,10 +412,10 @@ do_backup_instance(PGconn *backup_conn, PGNodeInfo *nodeInfo, bool no_sync, bool
412412

413413
if (current.backup_mode != BACKUP_MODE_FULL)
414414
{
415-
elog(LOG, "current_tli:%X", current.tli);
416-
elog(LOG, "prev_backup->start_lsn: %X/%X",
415+
elog(LOG, "Current tli: %X", current.tli);
416+
elog(LOG, "Parent start_lsn: %X/%X",
417417
(uint32) (prev_backup->start_lsn >> 32), (uint32) (prev_backup->start_lsn));
418-
elog(LOG, "current.start_lsn: %X/%X",
418+
elog(LOG, "start_lsn: %X/%X",
419419
(uint32) (current.start_lsn >> 32), (uint32) (current.start_lsn));
420420
}
421421

@@ -573,9 +573,6 @@ do_backup_instance(PGconn *backup_conn, PGNodeInfo *nodeInfo, bool no_sync, bool
573573
/* Notify end of backup */
574574
pg_stop_backup(&current, pg_startbackup_conn, nodeInfo);
575575

576-
elog(LOG, "current.stop_lsn: %X/%X",
577-
(uint32) (stop_backup_lsn >> 32), (uint32) (stop_backup_lsn));
578-
579576
/* In case of backup from replica >= 9.6 we must fix minRecPoint,
580577
* First we must find pg_control in backup_files_list.
581578
*/
@@ -1732,65 +1729,66 @@ pg_stop_backup(pgBackup *backup, PGconn *pg_startbackup_conn,
17321729
/* Calculate LSN */
17331730
stop_backup_lsn_tmp = ((uint64) lsn_hi) << 32 | lsn_lo;
17341731

1732+
/* It is ok for replica to return invalid STOP LSN
1733+
* UPD: Apparently it is ok even for a master.
1734+
*/
17351735
if (!XRecOffIsValid(stop_backup_lsn_tmp))
17361736
{
1737-
/* It is ok for replica to return STOP LSN with NullXRecOff
1738-
* UPD: Apparently it is ok even for master.
1739-
*/
1740-
if (XRecOffIsNull(stop_backup_lsn_tmp))
1741-
{
1742-
char *xlog_path,
1743-
stream_xlog_path[MAXPGPATH];
1744-
XLogSegNo segno = 0;
1745-
XLogRecPtr lsn_tmp = InvalidXLogRecPtr;
1737+
char *xlog_path,
1738+
stream_xlog_path[MAXPGPATH];
1739+
XLogSegNo segno = 0;
1740+
XLogRecPtr lsn_tmp = InvalidXLogRecPtr;
17461741

1747-
/*
1748-
* Even though the value is invalid, it's expected postgres behaviour
1749-
* and we're trying to fix it below.
1750-
*/
1751-
elog(LOG, "Null offset in stop_backup_lsn value %X/%X, trying to fix",
1752-
(uint32) (stop_backup_lsn_tmp >> 32), (uint32) (stop_backup_lsn_tmp));
1742+
/*
1743+
* Even though the value is invalid, it's expected postgres behaviour
1744+
* and we're trying to fix it below.
1745+
*/
1746+
elog(LOG, "Invalid offset in stop_lsn value %X/%X, trying to fix",
1747+
(uint32) (stop_backup_lsn_tmp >> 32), (uint32) (stop_backup_lsn_tmp));
17531748

1754-
/*
1755-
* Note: even with gdb it is very hard to produce automated tests for
1756-
* contrecord + NullXRecOff, so emulate it for manual testing.
1757-
*/
1758-
//stop_backup_lsn_tmp = stop_backup_lsn_tmp - XLOG_SEG_SIZE;
1759-
//elog(WARNING, "New Invalid stop_backup_lsn value %X/%X",
1760-
// (uint32) (stop_backup_lsn_tmp >> 32), (uint32) (stop_backup_lsn_tmp));
1749+
/*
1750+
* Note: even with gdb it is very hard to produce automated tests for
1751+
* contrecord + invalid LSN, so emulate it for manual testing.
1752+
*/
1753+
//stop_backup_lsn_tmp = stop_backup_lsn_tmp - XLOG_SEG_SIZE;
1754+
//elog(WARNING, "New Invalid stop_backup_lsn value %X/%X",
1755+
// (uint32) (stop_backup_lsn_tmp >> 32), (uint32) (stop_backup_lsn_tmp));
17611756

1762-
if (stream_wal)
1763-
{
1764-
pgBackupGetPath2(backup, stream_xlog_path,
1765-
lengthof(stream_xlog_path),
1766-
DATABASE_DIR, PG_XLOG_DIR);
1767-
xlog_path = stream_xlog_path;
1768-
}
1769-
else
1770-
xlog_path = arclog_path;
1757+
if (stream_wal)
1758+
{
1759+
pgBackupGetPath2(backup, stream_xlog_path,
1760+
lengthof(stream_xlog_path),
1761+
DATABASE_DIR, PG_XLOG_DIR);
1762+
xlog_path = stream_xlog_path;
1763+
}
1764+
else
1765+
xlog_path = arclog_path;
17711766

1772-
GetXLogSegNo(stop_backup_lsn_tmp, segno, instance_config.xlog_seg_size);
1767+
GetXLogSegNo(stop_backup_lsn_tmp, segno, instance_config.xlog_seg_size);
17731768

1774-
/*
1775-
* Note, that there is no guarantee that corresponding WAL file even exists.
1776-
* Replica may return LSN from future and keep staying in present.
1777-
* Or it can return LSN with NullXRecOff.
1778-
*
1779-
* That's bad, since we want to get real LSN to save it in backup label file
1780-
* and to use it in WAL validation.
1781-
*
1782-
* So we try to do the following:
1783-
* 1. Wait 'archive_timeout' seconds for segment containing stop_lsn and
1784-
* look for the first valid record in it.
1785-
* It solves the problem of occasional invalid XRecOff on write-busy system.
1786-
* 2. Failing that, look for record in previous segment with endpoint
1787-
* equal or greater than stop_lsn. It may(!) solve the problem of NullXRecOff
1788-
* on write-idle system. If that fails too, error out.
1789-
*/
1769+
/*
1770+
* Note, that there is no guarantee that corresponding WAL file even exists.
1771+
* Replica may return LSN from future and keep staying in present.
1772+
* Or it can return invalid LSN.
1773+
*
1774+
* That's bad, since we want to get real LSN to save it in backup label file
1775+
* and to use it in WAL validation.
1776+
*
1777+
* So we try to do the following:
1778+
* 1. Wait 'archive_timeout' seconds for segment containing stop_lsn and
1779+
* look for the first valid record in it.
1780+
* It solves the problem of occasional invalid LSN on write-busy system.
1781+
* 2. Failing that, look for record in previous segment with endpoint
1782+
* equal or greater than stop_lsn. It may(!) solve the problem of invalid LSN
1783+
* on write-idle system. If that fails too, error out.
1784+
*/
17901785

1786+
/* stop_lsn is pointing to a 0 byte of xlog segment */
1787+
if (stop_backup_lsn_tmp % instance_config.xlog_seg_size == 0)
1788+
{
17911789
/* Wait for segment with current stop_lsn, it is ok for it to never arrive */
17921790
wait_wal_lsn(stop_backup_lsn_tmp, false, backup->tli,
1793-
false, true, WARNING, stream_wal);
1791+
false, true, WARNING, stream_wal);
17941792

17951793
/* Get the first record in segment with current stop_lsn */
17961794
lsn_tmp = get_first_record_lsn(xlog_path, segno, backup->tli,
@@ -1826,17 +1824,39 @@ pg_stop_backup(pgBackup *backup, PGconn *pg_startbackup_conn,
18261824
(uint32) (stop_backup_lsn_tmp >> 32),
18271825
(uint32) (stop_backup_lsn_tmp));
18281826
}
1827+
}
1828+
/* stop lsn is aligned to xlog block size, just find next lsn */
1829+
else if (stop_backup_lsn_tmp % XLOG_BLCKSZ == 0)
1830+
{
1831+
/* Wait for segment with current stop_lsn */
1832+
wait_wal_lsn(stop_backup_lsn_tmp, false, backup->tli,
1833+
false, true, ERROR, stream_wal);
1834+
1835+
/* Get the next closest record in segment with current stop_lsn */
1836+
lsn_tmp = get_next_record_lsn(xlog_path, segno, backup->tli,
1837+
instance_config.xlog_seg_size,
1838+
instance_config.archive_timeout,
1839+
stop_backup_lsn_tmp);
18291840

1830-
/* Setting stop_backup_lsn will set stop point for streaming */
1831-
stop_backup_lsn = lsn_tmp;
1832-
stop_lsn_exists = true;
1841+
/* sanity */
1842+
if (!XRecOffIsValid(lsn_tmp) || XLogRecPtrIsInvalid(lsn_tmp))
1843+
elog(ERROR, "Failed to get WAL record next to %X/%X",
1844+
(uint32) (stop_backup_lsn_tmp >> 32),
1845+
(uint32) (stop_backup_lsn_tmp));
18331846
}
18341847
/* PostgreSQL returned something very illegal as STOP_LSN, error out */
18351848
else
18361849
elog(ERROR, "Invalid stop_backup_lsn value %X/%X",
18371850
(uint32) (stop_backup_lsn_tmp >> 32), (uint32) (stop_backup_lsn_tmp));
1851+
1852+
/* Setting stop_backup_lsn will set stop point for streaming */
1853+
stop_backup_lsn = lsn_tmp;
1854+
stop_lsn_exists = true;
18381855
}
18391856

1857+
elog(LOG, "stop_lsn: %X/%X",
1858+
(uint32) (stop_backup_lsn >> 32), (uint32) (stop_backup_lsn));
1859+
18401860
/* Write backup_label and tablespace_map */
18411861
if (!exclusive_backup)
18421862
{

src/parsexlog.c

Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -680,6 +680,97 @@ get_first_record_lsn(const char *archivedir, XLogSegNo segno,
680680
return record;
681681
}
682682

683+
684+
/*
685+
* Get LSN of the record next after target lsn.
686+
*/
687+
XLogRecPtr
688+
get_next_record_lsn(const char *archivedir, XLogSegNo segno,
689+
TimeLineID tli, uint32 wal_seg_size, int timeout,
690+
XLogRecPtr target)
691+
{
692+
XLogReaderState *xlogreader;
693+
XLogReaderData reader_data;
694+
XLogRecPtr startpoint, found, res;
695+
char wal_segment[MAXFNAMELEN];
696+
int attempts = 0;
697+
698+
if (segno <= 1)
699+
elog(ERROR, "Invalid WAL segment number " UINT64_FORMAT, segno);
700+
701+
GetXLogFileName(wal_segment, tli, segno, instance_config.xlog_seg_size);
702+
703+
xlogreader = InitXLogPageRead(&reader_data, archivedir, tli, wal_seg_size,
704+
false, false, true);
705+
if (xlogreader == NULL)
706+
elog(ERROR, "Out of memory");
707+
xlogreader->system_identifier = instance_config.system_identifier;
708+
709+
/* Set startpoint to 0 in segno */
710+
GetXLogRecPtr(segno, 0, wal_seg_size, startpoint);
711+
712+
found = XLogFindNextRecord(xlogreader, startpoint);
713+
714+
if (XLogRecPtrIsInvalid(found))
715+
{
716+
if (xlogreader->errormsg_buf[0] != '\0')
717+
elog(WARNING, "Could not read WAL record at %X/%X: %s",
718+
(uint32) (startpoint >> 32), (uint32) (startpoint),
719+
xlogreader->errormsg_buf);
720+
else
721+
elog(WARNING, "Could not read WAL record at %X/%X",
722+
(uint32) (startpoint >> 32), (uint32) (startpoint));
723+
PrintXLogCorruptionMsg(&reader_data, ERROR);
724+
}
725+
startpoint = found;
726+
727+
while (attempts <= timeout)
728+
{
729+
XLogRecord *record;
730+
char *errormsg;
731+
732+
if (interrupted)
733+
elog(ERROR, "Interrupted during WAL reading");
734+
735+
record = XLogReadRecord(xlogreader, startpoint, &errormsg);
736+
737+
if (record == NULL)
738+
{
739+
XLogRecPtr errptr;
740+
741+
errptr = XLogRecPtrIsInvalid(startpoint) ? xlogreader->EndRecPtr :
742+
startpoint;
743+
744+
if (errormsg)
745+
elog(WARNING, "Could not read WAL record at %X/%X: %s",
746+
(uint32) (errptr >> 32), (uint32) (errptr),
747+
errormsg);
748+
else
749+
elog(WARNING, "Could not read WAL record at %X/%X",
750+
(uint32) (errptr >> 32), (uint32) (errptr));
751+
PrintXLogCorruptionMsg(&reader_data, ERROR);
752+
}
753+
754+
if (xlogreader->ReadRecPtr >= target)
755+
{
756+
elog(LOG, "Record %X/%X is next after target LSN %X/%X",
757+
(uint32) (xlogreader->ReadRecPtr >> 32), (uint32) (xlogreader->ReadRecPtr),
758+
(uint32) (target >> 32), (uint32) (target));
759+
res = xlogreader->ReadRecPtr;
760+
break;
761+
}
762+
else
763+
startpoint = InvalidXLogRecPtr;
764+
}
765+
766+
/* cleanup */
767+
CleanupXLogPageRead(xlogreader);
768+
XLogReaderFree(xlogreader);
769+
770+
return res;
771+
}
772+
773+
683774
/*
684775
* Get LSN of a record prior to target_lsn.
685776
* If 'start_lsn' is in the segment with number 'segno' then start from 'start_lsn',

src/pg_probackup.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -997,6 +997,8 @@ extern XLogRecPtr get_prior_record_lsn(const char *archivedir, XLogRecPtr start_
997997

998998
extern XLogRecPtr get_first_record_lsn(const char *archivedir, XLogRecPtr start_lsn,
999999
TimeLineID tli, uint32 wal_seg_size, int timeout);
1000+
extern XLogRecPtr get_next_record_lsn(const char *archivedir, XLogSegNo segno, TimeLineID tli,
1001+
uint32 wal_seg_size, int timeout, XLogRecPtr target);
10001002

10011003
/* in util.c */
10021004
extern TimeLineID get_current_timeline(PGconn *conn);

tests/replica.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -606,7 +606,7 @@ def test_replica_stop_lsn_null_offset(self):
606606
return_id=False)
607607

608608
self.assertIn(
609-
'LOG: Null offset in stop_backup_lsn value 0/4000000',
609+
'LOG: Invalid offset in stop_lsn value 0/4000000',
610610
output)
611611

612612
self.assertIn(
@@ -714,7 +714,7 @@ def test_replica_stop_lsn_null_offset_next_record(self):
714714
log_content = f.read()
715715

716716
self.assertIn(
717-
'LOG: Null offset in stop_backup_lsn value 0/4000000',
717+
'LOG: Invalid offset in stop_lsn value 0/4000000',
718718
log_content)
719719

720720
self.assertIn(
@@ -787,7 +787,7 @@ def test_archive_replica_null_offset(self):
787787
return_id=False)
788788

789789
self.assertIn(
790-
'LOG: Null offset in stop_backup_lsn value 0/4000000',
790+
'LOG: Invalid offset in stop_lsn value 0/4000000',
791791
output)
792792

793793
self.assertIn(

0 commit comments

Comments
 (0)