spec.CheckpointBeforePgrewind #644

Open · wants to merge 1 commit into master
31 changes: 16 additions & 15 deletions cmd/keeper/cmd/keeper.go
@@ -840,8 +840,10 @@ func (p *PostgresKeeper) resync(db, followedDB *cluster.DB, tryPgrewind bool) er
// fallback to pg_basebackup
if tryPgrewind && p.usePgrewind(db) {
connParams := p.getSUConnParams(db, followedDB)
log.Infow("syncing using pg_rewind", "followedDB", followedDB.UID, "keeper", followedDB.Spec.KeeperUID)
if err := pgm.SyncFromFollowedPGRewind(connParams, p.pgSUPassword); err != nil {
checkpointBeforePgrewind := db.Spec.CheckpointBeforePgrewind
log.Infow("syncing using pg_rewind", "followedDB", followedDB.UID,
"keeper", followedDB.Spec.KeeperUID, "forcingCheckpoint", checkpointBeforePgrewind)
if err := pgm.SyncFromFollowedPGRewind(connParams, p.pgSUPassword, checkpointBeforePgrewind); err != nil {
// log pg_rewind error and fallback to pg_basebackup
log.Errorw("error syncing with pg_rewind", zap.Error(err))
} else {
@@ -1284,19 +1286,18 @@ func (p *PostgresKeeper) postgresKeeperSM(pctx context.Context) {
tryPgrewind = false
}

// TODO(sgotti) pg_rewind considers databases on the same timeline
// as in sync and doesn't check if they diverged at different
// position in previous timelines.
// So check that the db as been synced or resync again with
// pg_rewind disabled. Will need to report this upstream.

// TODO(sgotti) The rewinded standby needs wal from the master
// starting from the common ancestor, if they aren't available the
// instance will keep waiting for them, now we assume that if the
// instance isn't ready after the start timeout, it's waiting for
// wals and we'll force a full resync.
// We have to find a better way to detect if a standby is waiting
// for unavailable wals.
// TODO(sgotti) pg_rewind considers databases on the same timeline as in sync and
// doesn't check if they diverged at different position in previous timelines. So
// check that the db has been synced or resync again with pg_rewind disabled. Will
// need to report this upstream.

// TODO(sgotti) The rewinded standby needs wal from the master starting from the
// common ancestor, if they aren't available the instance will keep waiting for
// them, now we assume that if the instance isn't ready after the start timeout,
// it's waiting for wals and we'll force a full resync.
//
// We have to find a better way to detect if a standby is waiting for unavailable
// wals.

Contributor Author: I edited this comment as part of an unrelated change and line-wrapped it to match the rest of the file. Can leave it or remove it as makes most sense!

Member: ok

if err = p.resync(db, followedDB, tryPgrewind); err != nil {
log.Errorw("failed to resync from followed instance", zap.Error(err))
return
1 change: 1 addition & 0 deletions cmd/sentinel/cmd/sentinel.go
@@ -383,6 +383,7 @@ func (s *Sentinel) setDBSpecFromClusterSpec(cd *cluster.ClusterData) {
db.Spec.RequestTimeout = *clusterSpec.RequestTimeout
db.Spec.MaxStandbys = *clusterSpec.MaxStandbys
db.Spec.UsePgrewind = *clusterSpec.UsePgrewind
db.Spec.CheckpointBeforePgrewind = *clusterSpec.CheckpointBeforePgrewind
db.Spec.PGParameters = clusterSpec.PGParameters
db.Spec.PGHBA = clusterSpec.PGHBA
if db.Spec.FollowConfig != nil && db.Spec.FollowConfig.Type == cluster.FollowTypeExternal {
1 change: 1 addition & 0 deletions doc/cluster_spec.md
@@ -27,6 +27,7 @@ Some options in a running cluster specification can be changed to update the des
| additionalWalSenders | number of additional wal_senders in addition to the ones internally defined by stolon, useful to provide enough wal senders for external standbys (changing this value requires an instance restart) | no | uint16 | 5 |
| additionalMasterReplicationSlots | a list of additional physical replication slots to be created on the master postgres instance. They will be prefixed with `stolon_` (like internal replication slots used for standby replication) to make them "namespaced" from other replication slots. Replication slots starting with `stolon_` and not defined here (and not used for standby replication) will be dropped from the master instance. | no | []string | null |
| usePgrewind | try to use pg_rewind for faster instance resynchronization. | no | bool | false |
| checkpointBeforePgrewind | force a checkpoint against the current master before executing pg_rewind, preventing the rewind racing the checkpointer process after a standby is newly promoted. This will cause increased IO on whatever Postgres node the currently resyncing Postgres is following, as the checkpoint will not be immediate and will not respect checkpoint spread configuration. | no | bool | false |
| initMode | The cluster initialization mode. Can be *new* or *existing*. *new* means that a new db cluster will be created on a random keeper and the other keepers will sync with it. *existing* means that a keeper (that needs to have an already created db cluster) will be choosed as the initial master and the other keepers will sync with it. In this case the `existingConfig` object needs to be populated. | yes | string | |
| existingConfig | configuration for initMode of type "existing" | if initMode is "existing" | ExistingConfig | |
| mergePgParameters | merge pgParameters of the initialized db cluster, useful to retain initdb generated parameters when InitMode is new, retain current parameters when initMode is existing or pitr. | no | bool | true |
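As a quick illustration (not taken from the upstream docs), the new option would typically be enabled together with `usePgrewind` in the cluster specification. The field names below follow the JSON tags added in internal/cluster/cluster.go; such a patch could be applied, for example, via `stolonctl update --patch`:

```json
{
  "usePgrewind": true,
  "checkpointBeforePgrewind": true
}
```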
8 changes: 8 additions & 0 deletions internal/cluster/cluster.go
@@ -66,6 +66,7 @@ const (
DefaultMaxSynchronousStandbys uint16 = 1
DefaultAdditionalWalSenders = 5
DefaultUsePgrewind = false
DefaultCheckpointBeforePgrewind = false
DefaultMergePGParameter = true
DefaultRole ClusterRole = ClusterRoleMaster
DefaultSUReplAccess SUReplAccessMode = SUReplAccessAll
@@ -261,6 +262,8 @@ type ClusterSpec struct {
AdditionalMasterReplicationSlots []string `json:"additionalMasterReplicationSlots"`
// Whether to use pg_rewind
UsePgrewind *bool `json:"usePgrewind,omitempty"`
// Whether to issue a CHECKPOINT on the primary before attempting a pg_rewind
CheckpointBeforePgrewind *bool `json:"checkpointBeforePgrewind,omitempty"`
// InitMode defines the cluster initialization mode. Current modes are: new, existing, pitr
InitMode *ClusterInitMode `json:"initMode,omitempty"`
// Whether to merge pgParameters of the initialized db cluster, useful
@@ -379,6 +382,9 @@ func (os *ClusterSpec) WithDefaults() *ClusterSpec {
if s.UsePgrewind == nil {
s.UsePgrewind = BoolP(DefaultUsePgrewind)
}
if s.CheckpointBeforePgrewind == nil {
s.CheckpointBeforePgrewind = BoolP(DefaultCheckpointBeforePgrewind)
}
if s.MinSynchronousStandbys == nil {
s.MinSynchronousStandbys = Uint16P(DefaultMinSynchronousStandbys)
}
@@ -607,6 +613,8 @@ type DBSpec struct {
SynchronousReplication bool `json:"synchronousReplication,omitempty"`
// Whether to use pg_rewind
UsePgrewind bool `json:"usePgrewind,omitempty"`
// Whether to issue a CHECKPOINT on the primary before attempting a pg_rewind
CheckpointBeforePgrewind bool `json:"checkpointBeforePgrewind,omitempty"`
// AdditionalWalSenders defines the number of additional wal_senders in
// addition to the ones internally defined by stolon
AdditionalWalSenders uint16 `json:"additionalWalSenders"`
28 changes: 27 additions & 1 deletion internal/postgresql/postgresql.go
@@ -760,7 +760,7 @@ func (p *Manager) createPostgresqlAutoConf() error {
return nil
}

func (p *Manager) SyncFromFollowedPGRewind(followedConnParams ConnParams, password string) error {
func (p *Manager) SyncFromFollowedPGRewind(followedConnParams ConnParams, password string, forceCheckpoint bool) error {
// Remove postgresql.auto.conf since pg_rewind will error if it's a symlink to /dev/null
pgAutoConfPath := filepath.Join(p.dataDir, postgresAutoConf)
if err := os.Remove(pgAutoConfPath); err != nil && !os.IsNotExist(err) {
@@ -786,6 +786,32 @@ func (p *Manager) SyncFromFollowedPGRewind(followedConnParams ConnParams, passwo
followedConnParams.Set("options", "-c synchronous_commit=off")
followedConnString := followedConnParams.ConnString()

// We need to issue a checkpoint on the source before pg_rewind'ing because, until
// the primary checkpoints, the global/pg_control file won't contain up-to-date
// information about which timeline the primary is on.
//
// Imagine everyone is on timeline 1, then we promote a node to timeline 2. Standbys
// attempt to replicate from the newly promoted node but fail due to diverged timelines.
// pg_rewind is then used to resync the standbys, but if the new primary hasn't yet
// checkpointed, the pg_control file will tell us we're both on the same timeline (1)
// and pg_rewind will exit without performing any action.
//
// Checkpointing before invoking pg_rewind avoids this problem, at the cost of forcing
// a checkpoint on a newly promoted node, which might hurt performance. We (GoCardless)
// can't afford the hours of downtime a failed rewind and full resync would cause, so
// we accept the performance penalty.
if forceCheckpoint {
log.Infow("issuing checkpoint on primary")
psqlName := filepath.Join(p.pgBinPath, "psql")
Member: why are you using psql instead of directly calling it from go sql?

Contributor Author: I was doing this to ensure the way we connect to the Postgres for pg_rewind, pg_basebackup and for issuing a checkpoint was consistent. psql should behave exactly the same as rewind/basebackup, whereas connecting from within Go could be subtly different in many ways. Does that make sense, or do you think we should try constructing a Go connection?

https://gist.github.com/viggy28/954ff01cd3c29d317834a5c25951a1cd

@lawrencejones and @sgotti does this look okay? (since sslmode prefer is not applicable for lib/pq I have to replace that based on the SSL settings on the cluster)

cmd := exec.Command(psqlName, followedConnString, "-c", "CHECKPOINT;")
cmd.Env = append(os.Environ(), fmt.Sprintf("PGPASSFILE=%s", pgpass.Name()))
cmd.Stdout = os.Stdout
cmd.Stderr = os.Stderr
if err := cmd.Run(); err != nil {
return fmt.Errorf("error: %v", err)
}
}

log.Infow("running pg_rewind")
name := filepath.Join(p.pgBinPath, "pg_rewind")
cmd := exec.Command(name, "--debug", "-D", p.dataDir, "--source-server="+followedConnString)