
Allow I/O reliability checks using 16-bit checksums

Checksums are set immediately prior to flush out of shared buffers
and checked when pages are read in again. Hint bit setting will
require full page write when block is dirtied, which causes various
infrastructure changes. Extensive comments, docs and README.

WARNING message thrown if checksum fails on non-all zeroes page;
ERROR thrown but can be disabled with ignore_checksum_failure = on.

Feature enabled by an initdb option, since transition from option off
to option on is long and complex and has not yet been implemented.
Default is not to use checksums.

Checksum used is WAL CRC-32 truncated to 16-bits.

Simon Riggs, Jeff Davis, Greg Smith
Wide input and assistance from many community members. Thank you.
1 parent e4a05c7 · commit 96ef3b8ff1cf1950e897fd2f766d4bd9ef0d5d56 · simonat2ndQuadrant committed Mar 22, 2013
Showing with 766 additions and 146 deletions.
  1. +31 −1 contrib/pg_upgrade/controldata.c
  2. +1 −0 contrib/pg_upgrade/pg_upgrade.h
  3. +24 −0 doc/src/sgml/config.sgml
  4. +14 −0 doc/src/sgml/ref/initdb.sgml
  5. +6 −2 src/backend/access/gist/gistget.c
  6. +2 −4 src/backend/access/hash/hash.c
  7. +64 −32 src/backend/access/heap/heapam.c
  8. +1 −1 src/backend/access/heap/pruneheap.c
  9. +4 −0 src/backend/access/heap/rewriteheap.c
  10. +38 −10 src/backend/access/heap/visibilitymap.c
  11. +7 −3 src/backend/access/nbtree/nbtinsert.c
  12. +2 −1 src/backend/access/nbtree/nbtree.c
  13. +3 −0 src/backend/access/nbtree/nbtsort.c
  14. +2 −4 src/backend/access/nbtree/nbtutils.c
  15. +4 −0 src/backend/access/rmgrdesc/xlogdesc.c
  16. +3 −0 src/backend/access/spgist/spginsert.c
  17. +22 −0 src/backend/access/transam/README
  18. +107 −4 src/backend/access/transam/xlog.c
  19. +6 −1 src/backend/bootstrap/bootstrap.c
  20. +2 −0 src/backend/commands/matview.c
  21. +1 −1 src/backend/commands/sequence.c
  22. +2 −0 src/backend/commands/tablecmds.c
  23. +7 −7 src/backend/commands/vacuumlazy.c
  24. +175 −39 src/backend/storage/buffer/bufmgr.c
  25. +5 −2 src/backend/storage/buffer/localbuf.c
  26. +3 −1 src/backend/storage/freespace/README
  27. +6 −4 src/backend/storage/freespace/freespace.c
  28. +1 −1 src/backend/storage/freespace/fsmpage.c
  29. +167 −16 src/backend/storage/page/bufpage.c
  30. +16 −0 src/backend/utils/misc/guc.c
  31. +2 −2 src/backend/utils/time/tqual.c
  32. +16 −3 src/bin/initdb/initdb.c
  33. +2 −0 src/bin/pg_controldata/pg_controldata.c
  34. +2 −0 src/bin/pg_resetxlog/pg_resetxlog.c
  35. +1 −1 src/include/access/heapam_xlog.h
  36. +2 −2 src/include/access/visibilitymap.h
  37. +3 −0 src/include/access/xlog.h
  38. +6 −2 src/include/catalog/pg_control.h
  39. +2 −1 src/include/storage/bufmgr.h
  40. +4 −1 src/include/storage/bufpage.h
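
The commit message above describes the checksum as the WAL CRC-32 truncated to 16 bits, computed just before a page leaves shared buffers and verified when the page is read back. Below is a minimal standalone sketch of that idea. It is only an illustration, not the code added to bufpage.c in this commit; the CRC variant, the assumed offset of the checksum field, and the plain truncation are simplifications.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define PAGE_SIZE        8192
#define CHECKSUM_OFFSET  8   /* assumed offset of the 16-bit checksum field */

/* Plain bitwise CRC-32 (reflected polynomial); the backend uses its WAL CRC macros. */
static uint32_t crc32_buf(const uint8_t *buf, size_t len)
{
    uint32_t crc = 0xFFFFFFFFu;

    for (size_t i = 0; i < len; i++)
    {
        crc ^= buf[i];
        for (int bit = 0; bit < 8; bit++)
            crc = (crc & 1) ? (crc >> 1) ^ 0xEDB88320u : (crc >> 1);
    }
    return ~crc;
}

/* Compute a 16-bit page checksum: CRC-32 of the page image, truncated to 16 bits. */
static uint16_t page_checksum16(const uint8_t *page)
{
    uint8_t copy[PAGE_SIZE];

    memcpy(copy, page, PAGE_SIZE);
    memset(copy + CHECKSUM_OFFSET, 0, sizeof(uint16_t)); /* exclude the stored value */

    return (uint16_t) (crc32_buf(copy, PAGE_SIZE) & 0xFFFF);
}

int main(void)
{
    uint8_t page[PAGE_SIZE] = {0};

    memcpy(page + 64, "some tuple data", 15);
    printf("checksum = 0x%04x\n", page_checksum16(page));
    return 0;
}

A page that is still all zeroes has never been written with a checksum, which is why the commit message limits the WARNING to non-all-zeroes pages.
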
contrib/pg_upgrade/controldata.c
@@ -56,6 +56,7 @@ get_control_data(ClusterInfo *cluster, bool live_check)
bool got_toast = false;
bool got_date_is_int = false;
bool got_float8_pass_by_value = false;
+ bool got_data_checksums = false;
char *lc_collate = NULL;
char *lc_ctype = NULL;
char *lc_monetary = NULL;
@@ -131,6 +132,13 @@ get_control_data(ClusterInfo *cluster, bool live_check)
got_float8_pass_by_value = true;
}
+ /* Only in <= 9.2 */
+ if (GET_MAJOR_VERSION(cluster->major_version) <= 902)
+ {
+ cluster->controldata.data_checksums = false;
+ got_data_checksums = true;
+ }
+
/* we have the result of cmd in "output". so parse it line by line now */
while (fgets(bufin, sizeof(bufin), output))
{
@@ -393,6 +401,18 @@ get_control_data(ClusterInfo *cluster, bool live_check)
cluster->controldata.float8_pass_by_value = strstr(p, "by value") != NULL;
got_float8_pass_by_value = true;
}
+ else if ((p = strstr(bufin, "checksums")) != NULL)
+ {
+ p = strchr(p, ':');
+
+ if (p == NULL || strlen(p) <= 1)
+ pg_log(PG_FATAL, "%d: controldata retrieval problem\n", __LINE__);
+
+ p++; /* removing ':' char */
+ /* used later for contrib check */
+ cluster->controldata.data_checksums = strstr(p, "enabled") != NULL;
+ got_data_checksums = true;
+ }
/* In pre-8.4 only */
else if ((p = strstr(bufin, "LC_COLLATE:")) != NULL)
{
@@ -476,7 +496,7 @@ get_control_data(ClusterInfo *cluster, bool live_check)
!got_tli ||
!got_align || !got_blocksz || !got_largesz || !got_walsz ||
!got_walseg || !got_ident || !got_index || !got_toast ||
- !got_date_is_int || !got_float8_pass_by_value)
+ !got_date_is_int || !got_float8_pass_by_value || !got_data_checksums)
{
pg_log(PG_REPORT,
"The %s cluster lacks some required control information:\n",
@@ -535,6 +555,10 @@ get_control_data(ClusterInfo *cluster, bool live_check)
if (!got_float8_pass_by_value)
pg_log(PG_REPORT, " float8 argument passing method\n");
+ /* value added in Postgres 9.3 */
+ if (!got_data_checksums)
+ pg_log(PG_REPORT, " data checksums\n");
+
pg_log(PG_FATAL,
"Cannot continue without required control information, terminating\n");
}
@@ -596,6 +620,12 @@ check_control_data(ControlData *oldctrl,
"--disable-integer-datetimes or get server binaries built with those\n"
"options.\n");
}
+
+ if (oldctrl->data_checksums != newctrl->data_checksums)
+ {
+ pg_log(PG_FATAL,
+ "old and new pg_controldata checksums settings are invalid or do not match\n");
+ }
}
contrib/pg_upgrade/pg_upgrade.h
@@ -202,6 +202,7 @@ typedef struct
uint32 toast;
bool date_is_int;
bool float8_pass_by_value;
+ bool data_checksums;
char *lc_collate;
char *lc_ctype;
char *encoding;
doc/src/sgml/config.sgml
@@ -6629,6 +6629,30 @@ LOG: CleanUpLock: deleting: lock(0xb7acd844) id(24688,24696,0,0,0,1)
</listitem>
</varlistentry>
+ <varlistentry id="guc-ignore-checksum-failure" xreflabel="ignore_checksum_failure">
+ <term><varname>ignore_checksum_failure</varname> (<type>boolean</type>)</term>
+ <indexterm>
+ <primary><varname>ignore_checksum_failure</> configuration parameter</primary>
+ </indexterm>
+ <listitem>
+ <para>
+ Only has effect if <xref linkend="app-initdb-data-checksums"> are enabled.
+ </para>
+ <para>
+ Detection of a checksum failure during a read normally causes
+ <productname>PostgreSQL</> to report an error, aborting the current
+ transaction. Setting <varname>ignore_checksum_failure</> to on causes
+ the system to ignore the failure (but still report a warning), and
+ continue processing. This behavior may <emphasis>cause crashes, propagate
+ or hide corruption, or other serious problems</>. However, it may allow
+ you to get past the error and retrieve undamaged tuples that might still be
+ present in the table if the block header is still sane. If the header is
+ corrupt an error will be reported even if this option is enabled. The
+ default setting is <literal>off</>, and it can only be changed by a superuser.
+ </para>
+ </listitem>
+ </varlistentry>
+
<varlistentry id="guc-zero-damaged-pages" xreflabel="zero_damaged_pages">
<term><varname>zero_damaged_pages</varname> (<type>boolean</type>)</term>
<indexterm>
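
The ignore_checksum_failure paragraph above boils down to a small decision tree: a corrupt page header is always an error, while a checksum mismatch produces a warning and aborts the read only when the flag is off. A hypothetical standalone sketch follows, with invented names and plain stderr standing in for the backend's ereport machinery.

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

typedef struct
{
    bool     header_is_sane;   /* stand-in for the real page-header checks */
    uint16_t stored_checksum;
    uint8_t  data[128];        /* shrunken "page" body for the demo */
} PageImage;

static bool ignore_checksum_failure = false;   /* GUC analogue, default off */

/* Stand-in 16-bit checksum; the real one is the truncated WAL CRC-32. */
static uint16_t demo_checksum16(const PageImage *page)
{
    uint16_t sum = 0;

    for (size_t i = 0; i < sizeof(page->data); i++)
        sum = (uint16_t) (sum * 31 + page->data[i]);
    return sum;
}

/* Returns true if the page may be used, false if the caller must raise an error. */
static bool verify_page_on_read(const PageImage *page, unsigned blkno)
{
    if (!page->header_is_sane)
    {
        fprintf(stderr, "ERROR: invalid page header in block %u\n", blkno);
        return false;          /* header corruption is never ignorable */
    }

    if (demo_checksum16(page) != page->stored_checksum)
    {
        fprintf(stderr, "WARNING: page verification failed in block %u\n", blkno);
        if (!ignore_checksum_failure)
            return false;      /* normal case: abort the current transaction */
        /* flag is on: keep going and let undamaged tuples be read */
    }
    return true;
}

int main(void)
{
    PageImage page = { .header_is_sane = true };

    page.stored_checksum = demo_checksum16(&page);
    page.data[5] ^= 0x40;                 /* simulate a flipped bit on disk */

    ignore_checksum_failure = true;       /* superuser-only override */
    printf("usable: %d\n", verify_page_on_read(&page, 42));
    return 0;
}

As the documentation warns, leaving such a flag on can propagate or hide corruption; it is only a salvage aid for cases where the block header is still sane.
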
doc/src/sgml/ref/initdb.sgml
@@ -182,6 +182,20 @@ PostgreSQL documentation
</listitem>
</varlistentry>
+ <varlistentry id="app-initdb-data-checksums" xreflabel="data checksums">
+ <term><option>-k</option></term>
+ <term><option>--data-checksums</option></term>
+ <listitem>
+ <para>
+ Use checksums on data pages to help detect corruption by the
+ I/O system that would otherwise be silent. Enabling checksums
+ may incur a noticeable performance penalty. This option can only
+ be set during initialization, and cannot be changed later. If
+ set, checksums are calculated for all objects, in all databases.
+ </para>
+ </listitem>
+ </varlistentry>
+
<varlistentry>
<term><option>--locale=<replaceable>locale</replaceable></option></term>
<listitem>
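
The -k / --data-checksums option described above is a one-time, cluster-wide decision: it is recorded in the control data at initdb time and consulted on every page write, which is why the commit message notes that an off-to-on transition is long and complex and not yet implemented. Below is a rough sketch of that gating pattern, with invented names standing in for pg_control and the DataChecksumsEnabled() test that appears later in this diff.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Hypothetical, much-reduced analogue of the cluster-wide control data. */
typedef struct
{
    uint32_t version;
    bool     data_checksums;   /* decided once, at initdb time */
} DemoControlFile;

static DemoControlFile control;

/* initdb-time decision: record whether -k / --data-checksums was given. */
static void demo_initdb(bool data_checksums)
{
    control.version = 903;
    control.data_checksums = data_checksums;
}

/* Server-side query, analogous to the DataChecksumsEnabled() test in the diff. */
static bool demo_checksums_enabled(void)
{
    return control.data_checksums;
}

/* Every page write consults the flag; there is no per-table or per-database choice. */
static void demo_write_page(uint8_t *page, unsigned blkno)
{
    if (demo_checksums_enabled())
    {
        /* stamp the 16-bit checksum immediately before the page goes to disk; */
        /* computed as in the earlier CRC sketch, faked here as a constant     */
        page[8] = 0xAB;
        page[9] = 0xCD;
    }
    printf("writing block %u (checksums %s)\n", blkno,
           demo_checksums_enabled() ? "on" : "off");
}

int main(void)
{
    uint8_t page[8192] = {0};

    demo_initdb(true);            /* as if initdb -k had been run */
    demo_write_page(page, 0);
    return 0;
}

Consistent with this, the pg_upgrade hunks above refuse to migrate between clusters whose checksum settings do not match.
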
src/backend/access/gist/gistget.c
@@ -362,8 +362,12 @@ gistScanPage(IndexScanDesc scan, GISTSearchItem *pageItem, double *myDistances,
{
/* Creating index-page GISTSearchItem */
item->blkno = ItemPointerGetBlockNumber(&it->t_tid);
- /* lsn of current page is lsn of parent page for child */
- item->data.parentlsn = PageGetLSN(page);
+
+ /*
+ * LSN of current page is lsn of parent page for child. We only
+ * have a shared lock, so we need to get the LSN atomically.
+ */
+ item->data.parentlsn = BufferGetLSNAtomic(buffer);
}
/* Insert it into the queue using new distance data */
src/backend/access/hash/hash.c
@@ -285,11 +285,9 @@ hashgettuple(PG_FUNCTION_ARGS)
ItemIdMarkDead(PageGetItemId(page, offnum));
/*
- * Since this can be redone later if needed, it's treated the same
- * as a commit-hint-bit status update for heap tuples: we mark the
- * buffer dirty but don't make a WAL log entry.
+ * Since this can be redone later if needed, mark as a hint.
*/
- SetBufferCommitInfoNeedsSave(buf);
+ MarkBufferDirtyHint(buf);
}
/*
src/backend/access/heap/heapam.c
@@ -5754,17 +5754,23 @@ log_heap_freeze(Relation reln, Buffer buffer,
* being marked all-visible, and vm_buffer is the buffer containing the
* corresponding visibility map block. Both should have already been modified
* and dirtied.
+ *
+ * If checksums are enabled, we also add the heap_buffer to the chain to
+ * protect it from being torn.
*/
XLogRecPtr
-log_heap_visible(RelFileNode rnode, BlockNumber block, Buffer vm_buffer,
+log_heap_visible(RelFileNode rnode, Buffer heap_buffer, Buffer vm_buffer,
TransactionId cutoff_xid)
{
xl_heap_visible xlrec;
XLogRecPtr recptr;
- XLogRecData rdata[2];
+ XLogRecData rdata[3];
+
+ Assert(BufferIsValid(heap_buffer));
+ Assert(BufferIsValid(vm_buffer));
xlrec.node = rnode;
- xlrec.block = block;
+ xlrec.block = BufferGetBlockNumber(heap_buffer);
xlrec.cutoff_xid = cutoff_xid;
rdata[0].data = (char *) &xlrec;
@@ -5778,6 +5784,17 @@ log_heap_visible(RelFileNode rnode, BlockNumber block, Buffer vm_buffer,
rdata[1].buffer_std = false;
rdata[1].next = NULL;
+ if (DataChecksumsEnabled())
+ {
+ rdata[1].next = &(rdata[2]);
+
+ rdata[2].data = NULL;
+ rdata[2].len = 0;
+ rdata[2].buffer = heap_buffer;
+ rdata[2].buffer_std = true;
+ rdata[2].next = NULL;
+ }
+
recptr = XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_VISIBLE, rdata);
return recptr;
@@ -6139,8 +6156,6 @@ static void
heap_xlog_visible(XLogRecPtr lsn, XLogRecord *record)
{
xl_heap_visible *xlrec = (xl_heap_visible *) XLogRecGetData(record);
- Buffer buffer;
- Page page;
/*
* If there are any Hot Standby transactions running that have an xmin
@@ -6155,39 +6170,56 @@ heap_xlog_visible(XLogRecPtr lsn, XLogRecord *record)
ResolveRecoveryConflictWithSnapshot(xlrec->cutoff_xid, xlrec->node);
/*
- * Read the heap page, if it still exists. If the heap file has been
- * dropped or truncated later in recovery, we don't need to update the
- * page, but we'd better still update the visibility map.
+ * If heap block was backed up, restore it. This can only happen with
+ * checksums enabled.
*/
- buffer = XLogReadBufferExtended(xlrec->node, MAIN_FORKNUM, xlrec->block,
- RBM_NORMAL);
- if (BufferIsValid(buffer))
+ if (record->xl_info & XLR_BKP_BLOCK(1))
{
- LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
-
- page = (Page) BufferGetPage(buffer);
+ Assert(DataChecksumsEnabled());
+ (void) RestoreBackupBlock(lsn, record, 1, false, false);
+ }
+ else
+ {
+ Buffer buffer;
+ Page page;
/*
- * We don't bump the LSN of the heap page when setting the visibility
- * map bit, because that would generate an unworkable volume of
- * full-page writes. This exposes us to torn page hazards, but since
- * we're not inspecting the existing page contents in any way, we
- * don't care.
- *
- * However, all operations that clear the visibility map bit *do* bump
- * the LSN, and those operations will only be replayed if the XLOG LSN
- * follows the page LSN. Thus, if the page LSN has advanced past our
- * XLOG record's LSN, we mustn't mark the page all-visible, because
- * the subsequent update won't be replayed to clear the flag.
+ * Read the heap page, if it still exists. If the heap file has been
+ * dropped or truncated later in recovery, we don't need to update the
+ * page, but we'd better still update the visibility map.
*/
- if (lsn > PageGetLSN(page))
+ buffer = XLogReadBufferExtended(xlrec->node, MAIN_FORKNUM,
+ xlrec->block, RBM_NORMAL);
+ if (BufferIsValid(buffer))
{
- PageSetAllVisible(page);
- MarkBufferDirty(buffer);
- }
+ LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
- /* Done with heap page. */
- UnlockReleaseBuffer(buffer);
+ page = (Page) BufferGetPage(buffer);
+
+ /*
+ * We don't bump the LSN of the heap page when setting the
+ * visibility map bit (unless checksums are enabled, in which case
+ * we must), because that would generate an unworkable volume of
+ * full-page writes. This exposes us to torn page hazards, but
+ * since we're not inspecting the existing page contents in any
+ * way, we don't care.
+ *
+ * However, all operations that clear the visibility map bit *do*
+ * bump the LSN, and those operations will only be replayed if the
+ * XLOG LSN follows the page LSN. Thus, if the page LSN has
+ * advanced past our XLOG record's LSN, we mustn't mark the page
+ * all-visible, because the subsequent update won't be replayed to
+ * clear the flag.
+ */
+ if (lsn > PageGetLSN(page))
+ {
+ PageSetAllVisible(page);
+ MarkBufferDirty(buffer);
+ }
+
+ /* Done with heap page. */
+ UnlockReleaseBuffer(buffer);
+ }
}
/*
@@ -6218,7 +6250,7 @@ heap_xlog_visible(XLogRecPtr lsn, XLogRecord *record)
* real harm is done; and the next VACUUM will fix it.
*/
if (lsn > PageGetLSN(BufferGetPage(vmbuffer)))
- visibilitymap_set(reln, xlrec->block, lsn, vmbuffer,
+ visibilitymap_set(reln, xlrec->block, InvalidBuffer, lsn, vmbuffer,
xlrec->cutoff_xid);
ReleaseBuffer(vmbuffer);
src/backend/access/heap/pruneheap.c
@@ -262,7 +262,7 @@ heap_page_prune(Relation relation, Buffer buffer, TransactionId OldestXmin,
{
((PageHeader) page)->pd_prune_xid = prstate.new_prune_xid;
PageClearFull(page);
- SetBufferCommitInfoNeedsSave(buffer);
+ MarkBufferDirtyHint(buffer);
}
}
src/backend/access/heap/rewriteheap.c
@@ -273,6 +273,8 @@ end_heap_rewrite(RewriteState state)
/* Write the last page, if any */
if (state->rs_buffer_valid)
{
+ PageSetChecksumInplace(state->rs_buffer, state->rs_blockno);
+
if (state->rs_use_wal)
log_newpage(&state->rs_new_rel->rd_node,
MAIN_FORKNUM,
@@ -614,6 +616,8 @@ raw_heap_insert(RewriteState state, HeapTuple tup)
{
/* Doesn't fit, so write out the existing page */
+ PageSetChecksumInplace(page, state->rs_blockno);
+
/* XLOG stuff */
if (state->rs_use_wal)
log_newpage(&state->rs_new_rel->rd_node,