Skip to content

Commit

Permalink
Force checkpoint before rewind
Browse files Browse the repository at this point in the history
Prior to running pg_rewind, force a checkpoint on the source database to
ensure the pg_control file used by pg_rewind is up-to-date. Failure to
do so can result in pg_rewind not detecting a timeline fork and exiting
with no action, leading stolon to fall back on pg_basebackup (which
itself will checkpoint) to ensure a resync is successful.
  • Loading branch information
lawrencejones committed May 17, 2019
1 parent a27bcae commit 487d25b
Show file tree
Hide file tree
Showing 2 changed files with 29 additions and 2 deletions.
3 changes: 2 additions & 1 deletion cmd/keeper/cmd/keeper.go
Original file line number Diff line number Diff line change
Expand Up @@ -811,7 +811,8 @@ func (p *PostgresKeeper) resync(db, followedDB *cluster.DB, tryPgrewind bool) er
if tryPgrewind && p.usePgrewind(db) {
connParams := p.getSUConnParams(db, followedDB)
log.Infow("syncing using pg_rewind", "followedDB", followedDB.UID, "keeper", followedDB.Spec.KeeperUID)
if err := pgm.SyncFromFollowedPGRewind(connParams, p.pgSUPassword); err != nil {
// TODO: Take forceCheckpoint from the cluster specification instead of hard-coding true
if err := pgm.SyncFromFollowedPGRewind(connParams, p.pgSUPassword, true); err != nil {
// log pg_rewind error and fallback to pg_basebackup
log.Errorw("error syncing with pg_rewind", zap.Error(err))
} else {
Expand Down
28 changes: 27 additions & 1 deletion internal/postgresql/postgresql.go
Original file line number Diff line number Diff line change
Expand Up @@ -754,7 +754,7 @@ func (p *Manager) createPostgresqlAutoConf() error {
return nil
}

func (p *Manager) SyncFromFollowedPGRewind(followedConnParams ConnParams, password string) error {
func (p *Manager) SyncFromFollowedPGRewind(followedConnParams ConnParams, password string, forceCheckpoint bool) error {
// Remove postgresql.auto.conf since pg_rewind will error if it's a symlink to /dev/null
pgAutoConfPath := filepath.Join(p.dataDir, postgresAutoConf)
if err := os.Remove(pgAutoConfPath); err != nil && !os.IsNotExist(err) {
Expand All @@ -780,6 +780,32 @@ func (p *Manager) SyncFromFollowedPGRewind(followedConnParams ConnParams, passwo
followedConnParams.Set("options", "-c synchronous_commit=off")
followedConnString := followedConnParams.ConnString()

// TODO: Follow up with tests. We need to issue a checkpoint on the primary before
// starting our recovery: until the primary checkpoints, the global/pg_control file
// won't contain up-to-date information about which timeline the primary is on.
//
// Imagine everyone is on timeline 1, then we promote a node to timeline 2. Standbys
// attempt to replicate from the newly promoted node but fail due to diverged timelines.
// pg_rewind is then used to resync the standbys, but if the new primary hasn't yet
// checkpointed, the pg_control file will tell us we're both on the same timeline (1)
// and pg_rewind will exit without performing any action.
//
// If we checkpoint before invoking pg_rewind we will avoid this problem, at the slight
// cost of forcing a checkpoint on a newly promoted node, which might hurt performance.
// We (GoCardless) can't afford a pg_rewind that silently does nothing and forces a
// fallback to a full pg_basebackup resync, so we take the performance penalty to avoid
// hours of downtime.
if forceCheckpoint {
log.Infow("issuing checkpoint on primary")
psqlName := filepath.Join(p.pgBinPath, "psql")
cmd := exec.Command(psqlName, followedConnString, "-c", "CHECKPOINT;")
cmd.Env = append(os.Environ(), fmt.Sprintf("PGPASSFILE=%s", pgpass.Name()))
cmd.Stdout = os.Stdout
cmd.Stderr = os.Stderr
if err := cmd.Run(); err != nil {
return fmt.Errorf("error: %v", err)
}
}

log.Infow("running pg_rewind")
name := filepath.Join(p.pgBinPath, "pg_rewind")
cmd := exec.Command(name, "--debug", "-D", p.dataDir, "--source-server="+followedConnString)
Expand Down

0 comments on commit 487d25b

Please sign in to comment.