Skip to content

Commit

Permalink
pg_upgrade: Convert old visibility map format to new format.
Browse files Browse the repository at this point in the history
Commit a892234 added a second bit per
page to the visibility map, but pg_upgrade has been unaware of it up
until now.  Therefore, a pg_upgrade from an earlier major release of
PostgreSQL to any commit preceding this one and following the one
mentioned above would result in invalid visibility map contents on the
new cluster, very possibly leading to data corruption.  This plugs
that hole.

Masahiko Sawada, reviewed by Jeff Janes, Bruce Momjian, Simon Riggs,
Michael Paquier, Andres Freund, me, and others.
  • Loading branch information
robertmhaas committed Mar 11, 2016
1 parent 9118d03 commit 7087166
Show file tree
Hide file tree
Showing 3 changed files with 197 additions and 11 deletions.
154 changes: 154 additions & 0 deletions src/bin/pg_upgrade/file.c
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,16 @@

#include "postgres_fe.h"

#include "access/visibilitymap.h"
#include "pg_upgrade.h"
#include "storage/bufpage.h"
#include "storage/checksum.h"
#include "storage/checksum_impl.h"

#include <sys/stat.h>
#include <fcntl.h>

#define BITS_PER_HEAPBLOCK_OLD 1


#ifndef WIN32
Expand Down Expand Up @@ -138,6 +144,154 @@ copy_file(const char *srcfile, const char *dstfile, bool force)
#endif


/*
 * closeAndGetErrorText()
 *
 * Error-exit helper for rewriteVisibilityMap(): closes whichever of the two
 * descriptors are open (pass -1 for one that is not) and returns the error
 * text for the failure that brought us here.  getErrorText() takes no
 * arguments, so it presumably reports errno; we therefore preserve errno
 * across the close() calls so they cannot clobber the original error.
 */
static const char *
closeAndGetErrorText(int src_fd, int dst_fd)
{
	int			save_errno = errno;

	if (dst_fd >= 0)
		close(dst_fd);
	if (src_fd >= 0)
		close(src_fd);

	errno = save_errno;
	return getErrorText();
}

/*
 * rewriteVisibilityMap()
 *
 * In versions of PostgreSQL prior to catversion 201603011, PostgreSQL's
 * visibility map included one bit per heap page; it now includes two.
 * When upgrading a cluster from before that time to a current PostgreSQL
 * version, we could refuse to copy visibility maps from the old cluster
 * to the new cluster; the next VACUUM would recreate them, but at the
 * price of scanning the entire table.  So, instead, we rewrite the old
 * visibility maps in the new format.  That way, the all-visible bit
 * remains set for the pages for which it was set previously.  The
 * all-frozen bit is never set by this conversion; we leave that to
 * VACUUM.
 *
 * fromfile is the old-format visibility map fork to read; tofile is the
 * new-format file to create.  If force is false, tofile must not already
 * exist (it is opened with O_EXCL).
 *
 * Returns NULL on success, or an error message string on failure.  A read
 * error or a trailing partial page in fromfile is reported as a failure
 * rather than being silently ignored.
 */
const char *
rewriteVisibilityMap(const char *fromfile, const char *tofile, bool force)
{
	int			src_fd;
	int			dst_fd;
	char		buffer[BLCKSZ];
	ssize_t		bytesRead;
	ssize_t		totalBytesRead = 0;
	ssize_t		src_filesize;
	int			rewriteVmBytesPerPage;
	BlockNumber new_blkno = 0;
	struct stat statbuf;

	/*
	 * Number of old-format map bytes that expand to exactly one new page:
	 * every old byte becomes two new bytes, so half of a page's payload.
	 */
	rewriteVmBytesPerPage = (BLCKSZ - SizeOfPageHeaderData) / 2;

	if ((fromfile == NULL) || (tofile == NULL))
		return "Invalid old file or new file";

	if ((src_fd = open(fromfile, O_RDONLY, 0)) < 0)
		return getErrorText();

	if (fstat(src_fd, &statbuf) != 0)
		return closeAndGetErrorText(src_fd, -1);

	if ((dst_fd = open(tofile, O_RDWR | O_CREAT | (force ? 0 : O_EXCL), S_IRUSR | S_IWUSR)) < 0)
		return closeAndGetErrorText(src_fd, -1);

	/* Save old file size */
	src_filesize = statbuf.st_size;

	/*
	 * Turn each visibility map page into 2 pages one by one.  Each new page
	 * has the same page header as the old one.  If the last section of the
	 * last page is empty, we skip it, mostly to avoid turning one-page
	 * visibility maps for small relations into two pages needlessly.
	 */
	while (totalBytesRead < src_filesize)
	{
		char	   *old_cur;
		char	   *old_break;
		char	   *old_blkend;
		PageHeaderData pageheader;
		bool		old_lastblk;

		bytesRead = read(src_fd, buffer, BLCKSZ);
		if (bytesRead < 0)
			return closeAndGetErrorText(src_fd, dst_fd);
		if (bytesRead != BLCKSZ)
		{
			/* Short read: the old map does not consist of whole pages */
			close(dst_fd);
			close(src_fd);
			return "partial page found in visibility map file";
		}

		/*
		 * Detect the last old block by counting bytes consumed from the old
		 * file.  (new_blkno must not be used for this: it counts new pages
		 * and advances twice for every old page.)
		 */
		totalBytesRead += BLCKSZ;
		old_lastblk = (totalBytesRead == src_filesize);

		/* Save the page header data */
		memcpy(&pageheader, buffer, SizeOfPageHeaderData);

		/*
		 * These old_* variables point into the old visibility map page.
		 * old_cur is the current read position, old_blkend is the end of the
		 * old block, and old_break is where the input for the new page
		 * currently being built ends.  After each new page is written,
		 * old_break advances by rewriteVmBytesPerPage bytes.
		 */
		old_cur = buffer + SizeOfPageHeaderData;
		old_blkend = buffer + bytesRead;
		old_break = old_cur + rewriteVmBytesPerPage;

		while (old_break <= old_blkend)
		{
			/*
			 * The output page is accessed through a PageHeader pointer and
			 * handed to pg_checksum_page(), so make sure it is suitably
			 * aligned rather than relying on the alignment of a char array.
			 */
			union
			{
				char		data[BLCKSZ];
				double		force_align_d;
				long		force_align_l;
			}			new_vmbuf;
			char	   *new_cur = new_vmbuf.data;
			bool		empty = true;
			bool		old_lastpart;

			/* Copy page header in advance */
			memcpy(new_vmbuf.data, &pageheader, SizeOfPageHeaderData);

			/* Are we rewriting the last part of the last old page? */
			old_lastpart = old_lastblk && (old_break == old_blkend);

			new_cur += SizeOfPageHeaderData;

			/* Process old page bytes one by one, expanding each to two. */
			while (old_cur < old_break)
			{
				uint8		byte = *(uint8 *) old_cur;
				uint16		new_vmbits = 0;
				int			i;

				/* Generate new format bits while keeping old information */
				for (i = 0; i < BITS_PER_BYTE; i++)
				{
					if (byte & (1 << (BITS_PER_HEAPBLOCK_OLD * i)))
					{
						empty = false;
						new_vmbits |= 1 << (BITS_PER_HEAPBLOCK * i);
					}
				}

				/*
				 * Store the 16 new bits low-order byte first, independent of
				 * host byte order, so that heap block N's bits always follow
				 * heap block N-1's bits in the on-disk map.  (Copying the
				 * uint16 with memcpy would swap the bytes on big-endian
				 * machines.)
				 */
				new_cur[0] = (char) (new_vmbits & 0xFF);
				new_cur[1] = (char) (new_vmbits >> 8);

				old_cur += BITS_PER_HEAPBLOCK_OLD;
				new_cur += BITS_PER_HEAPBLOCK;
			}

			/* If the last part of the last old page is empty, don't write it */
			if (old_lastpart && empty)
				break;

			/*
			 * Set new checksum for the visibility map page, if enabled.  The
			 * page is going into the new cluster, so only the new cluster's
			 * setting is relevant.
			 */
			if (new_cluster.controldata.data_checksum_version != 0)
				((PageHeader) new_vmbuf.data)->pd_checksum =
					pg_checksum_page(new_vmbuf.data, new_blkno);

			errno = 0;
			if (write(dst_fd, new_vmbuf.data, BLCKSZ) != BLCKSZ)
			{
				/* if write didn't set errno, assume problem is no disk space */
				if (errno == 0)
					errno = ENOSPC;
				return closeAndGetErrorText(src_fd, dst_fd);
			}

			/* Advance for next new page */
			old_break += rewriteVmBytesPerPage;
			new_blkno++;
		}
	}

	/* Close files */
	close(dst_fd);
	close(src_fd);

	return NULL;

}

void
check_hard_link(void)
{
Expand Down
6 changes: 6 additions & 0 deletions src/bin/pg_upgrade/pg_upgrade.h
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,10 @@ extern char *output_files[];
*/
#define VISIBILITY_MAP_CRASHSAFE_CAT_VER 201107031

/*
* The format of the visibility map changed with this 9.6 commit.
*/
#define VISIBILITY_MAP_FROZEN_BIT_CAT_VER 201603011
/*
* pg_multixact format changed in 9.3 commit 0ac5ad5134f2769ccbaefec73844f85,
* ("Improve concurrency of foreign key locking") which also updated catalog
Expand Down Expand Up @@ -365,6 +369,8 @@ bool pid_lock_file_exists(const char *datadir);

const char *copyFile(const char *src, const char *dst, bool force);
const char *linkFile(const char *src, const char *dst);
const char *rewriteVisibilityMap(const char *fromfile, const char *tofile,
bool force);

void check_hard_link(void);
FILE *fopen_priv(const char *path, const char *mode);
Expand Down
48 changes: 37 additions & 11 deletions src/bin/pg_upgrade/relfilenode.c
Original file line number Diff line number Diff line change
Expand Up @@ -11,12 +11,13 @@

#include "pg_upgrade.h"

#include <sys/stat.h>
#include "catalog/pg_class.h"
#include "access/transam.h"


static void transfer_single_new_db(FileNameMap *maps, int size, char *old_tablespace);
static void transfer_relfile(FileNameMap *map, const char *suffix);
static void transfer_relfile(FileNameMap *map, const char *suffix, bool vm_must_add_frozenbit);


/*
Expand Down Expand Up @@ -132,6 +133,7 @@ transfer_single_new_db(FileNameMap *maps, int size, char *old_tablespace)
{
int mapnum;
bool vm_crashsafe_match = true;
bool vm_must_add_frozenbit = false;

/*
* Do the old and new cluster disagree on the crash-safetiness of the vm
Expand All @@ -141,23 +143,30 @@ transfer_single_new_db(FileNameMap *maps, int size, char *old_tablespace)
new_cluster.controldata.cat_ver >= VISIBILITY_MAP_CRASHSAFE_CAT_VER)
vm_crashsafe_match = false;

/*
* Do we need to rewrite visibilitymap?
*/
if (old_cluster.controldata.cat_ver < VISIBILITY_MAP_FROZEN_BIT_CAT_VER &&
new_cluster.controldata.cat_ver >= VISIBILITY_MAP_FROZEN_BIT_CAT_VER)
vm_must_add_frozenbit = true;

for (mapnum = 0; mapnum < size; mapnum++)
{
if (old_tablespace == NULL ||
strcmp(maps[mapnum].old_tablespace, old_tablespace) == 0)
{
/* transfer primary file */
transfer_relfile(&maps[mapnum], "");
transfer_relfile(&maps[mapnum], "", vm_must_add_frozenbit);

/* fsm/vm files added in PG 8.4 */
if (GET_MAJOR_VERSION(old_cluster.major_version) >= 804)
{
/*
* Copy/link any fsm and vm files, if they exist
*/
transfer_relfile(&maps[mapnum], "_fsm");
transfer_relfile(&maps[mapnum], "_fsm", vm_must_add_frozenbit);
if (vm_crashsafe_match)
transfer_relfile(&maps[mapnum], "_vm");
transfer_relfile(&maps[mapnum], "_vm", vm_must_add_frozenbit);
}
}
}
Expand All @@ -167,17 +176,19 @@ transfer_single_new_db(FileNameMap *maps, int size, char *old_tablespace)
/*
* transfer_relfile()
*
* Copy or link file from old cluster to new one.
* Copy or link file from old cluster to new one. If vm_must_add_frozenbit
* is true, visibility map forks are converted and rewritten, even in link
* mode.
*/
static void
transfer_relfile(FileNameMap *map, const char *type_suffix)
transfer_relfile(FileNameMap *map, const char *type_suffix, bool vm_must_add_frozenbit)
{
const char *msg;
char old_file[MAXPGPATH];
char new_file[MAXPGPATH];
int fd;
int segno;
char extent_suffix[65];
struct stat statbuf;

/*
* Now copy/link any related segments as well. Remember, PG breaks large
Expand Down Expand Up @@ -210,7 +221,7 @@ transfer_relfile(FileNameMap *map, const char *type_suffix)
if (type_suffix[0] != '\0' || segno != 0)
{
/* Did file open fail? */
if ((fd = open(old_file, O_RDONLY, 0)) == -1)
if (stat(old_file, &statbuf) != 0)
{
/* File does not exist? That's OK, just return */
if (errno == ENOENT)
Expand All @@ -220,7 +231,10 @@ transfer_relfile(FileNameMap *map, const char *type_suffix)
map->nspname, map->relname, old_file, new_file,
getErrorText());
}
close(fd);

/* If file is empty, just return */
if (statbuf.st_size == 0)
return;
}

unlink(new_file);
Expand All @@ -232,15 +246,27 @@ transfer_relfile(FileNameMap *map, const char *type_suffix)
{
pg_log(PG_VERBOSE, "copying \"%s\" to \"%s\"\n", old_file, new_file);

if ((msg = copyFile(old_file, new_file, true)) != NULL)
/* Rewrite visibility map if needed */
if (vm_must_add_frozenbit && (strcmp(type_suffix, "_vm") == 0))
msg = rewriteVisibilityMap(old_file, new_file, true);
else
msg = copyFile(old_file, new_file, true);

if (msg)
pg_fatal("error while copying relation \"%s.%s\" (\"%s\" to \"%s\"): %s\n",
map->nspname, map->relname, old_file, new_file, msg);
}
else
{
pg_log(PG_VERBOSE, "linking \"%s\" to \"%s\"\n", old_file, new_file);

if ((msg = linkFile(old_file, new_file)) != NULL)
/* Rewrite visibility map if needed */
if (vm_must_add_frozenbit && (strcmp(type_suffix, "_vm") == 0))
msg = rewriteVisibilityMap(old_file, new_file, true);
else
msg = linkFile(old_file, new_file);

if (msg)
pg_fatal("error while creating link for relation \"%s.%s\" (\"%s\" to \"%s\"): %s\n",
map->nspname, map->relname, old_file, new_file, msg);
}
Expand Down

0 comments on commit 7087166

Please sign in to comment.