Skip to content

Commit

Permalink
upgrade: resilience fixes
Browse files Browse the repository at this point in the history
Code no longer relies on webhook behavior in terms of default values of
upgrade knobs.
When Retry failure strategy is used, operator won't log erros when node
does not become ready, instead it will retry infinitely.
  • Loading branch information
zimnx committed Dec 28, 2020
1 parent d6ec4d3 commit 5bc4e30
Show file tree
Hide file tree
Showing 2 changed files with 39 additions and 15 deletions.
9 changes: 7 additions & 2 deletions pkg/api/v1alpha1/cluster_webhook.go
Expand Up @@ -45,6 +45,11 @@ func (r *ScyllaCluster) SetupWebhookWithManager(mgr ctrl.Manager) error {
var _ webhook.Defaulter = &ScyllaCluster{}
var _ webhook.Validator = &ScyllaCluster{}

const (
DefaultGenericUpgradePollInterval = time.Second
DefaultGenericUpgradeValidationTimeout = 30 * time.Minute
)

func (c *ScyllaCluster) Default() {
for _, r := range c.Spec.Datacenter.Racks {
// Empty agent resources
Expand Down Expand Up @@ -107,11 +112,11 @@ func (c *ScyllaCluster) Default() {
}

if c.Spec.GenericUpgrade.ValidationTimeout == nil {
c.Spec.GenericUpgrade.ValidationTimeout = &metav1.Duration{Duration: 30 * time.Minute}
c.Spec.GenericUpgrade.ValidationTimeout = &metav1.Duration{Duration: DefaultGenericUpgradeValidationTimeout}
}

if c.Spec.GenericUpgrade.PollInterval == nil {
c.Spec.GenericUpgrade.PollInterval = &metav1.Duration{Duration: time.Second}
c.Spec.GenericUpgrade.PollInterval = &metav1.Duration{Duration: DefaultGenericUpgradePollInterval}
}
}

Expand Down
45 changes: 32 additions & 13 deletions pkg/controllers/cluster/actions/upgrade_version.go
Expand Up @@ -43,8 +43,9 @@ type ClusterVersionUpgrade struct {
ScyllaClient *scyllaclient.Client
ClusterSession cqlSession

ipMapping map[string]string
pollInterval time.Duration
ipMapping map[string]string
pollInterval time.Duration
validationTimeout time.Duration

currentRack *scyllav1alpha1.RackSpec
currentNode *corev1.Pod
Expand Down Expand Up @@ -140,7 +141,18 @@ func (a *ClusterVersionUpgrade) Execute(ctx context.Context, s *State) error {
a.cc = s.Client
a.kubeClient = s.kubeclient
a.recorder = s.recorder
a.pollInterval = a.Cluster.Spec.GenericUpgrade.PollInterval.Duration

a.pollInterval = scyllav1alpha1.DefaultGenericUpgradePollInterval
a.validationTimeout = scyllav1alpha1.DefaultGenericUpgradeValidationTimeout

if a.Cluster.Spec.GenericUpgrade != nil {
if a.Cluster.Spec.GenericUpgrade.PollInterval != nil {
a.pollInterval = a.Cluster.Spec.GenericUpgrade.PollInterval.Duration
}
if a.Cluster.Spec.GenericUpgrade.ValidationTimeout != nil {
a.validationTimeout = a.Cluster.Spec.GenericUpgrade.ValidationTimeout.Duration
}
}

switch a.upgradeProcedure(ctx) {
case genericUpgradeProcedure:
Expand Down Expand Up @@ -663,8 +675,8 @@ func (a *ClusterVersionUpgrade) deletePod(ctx context.Context) (fsm.Event, error
return ActionSuccess, nil
}

func (a *ClusterVersionUpgrade) validateUpgrade(ctx context.Context) (fsm.Event, error) {
err := wait.PollImmediate(a.pollInterval, a.Cluster.Spec.GenericUpgrade.ValidationTimeout.Duration, func() (done bool, err error) {
func (a *ClusterVersionUpgrade) nodeUpgradedConditionFunc(ctx context.Context) func() (bool, error) {
return func() (bool, error) {
node, err := a.getCurrentNode(ctx)
if err != nil {
return false, errors.Wrap(err, "get current node")
Expand All @@ -684,15 +696,22 @@ func (a *ClusterVersionUpgrade) validateUpgrade(ctx context.Context) (fsm.Event,

a.logger.Debug(ctx, "Node validation", "node", node.Name, "ready", podReady(node), "ver", ver)
return podReady(node) && a.Cluster.Status.Upgrade.ToVersion == ver, nil
})
if err != nil {
if errors.Is(err, wait.ErrWaitTimeout) {
if a.Cluster.Spec.GenericUpgrade.FailureStrategy == scyllav1alpha1.GenericUpgradeFailureStrategyRetry {
// Return error to reschedule reconciliation loop.
return ActionFailure, errors.Wrap(err, "wait for ready pods")
}
}
}

func (a *ClusterVersionUpgrade) validateUpgrade(ctx context.Context) (fsm.Event, error) {
failureStrategy := scyllav1alpha1.GenericUpgradeFailureStrategyRetry
if a.Cluster.Spec.GenericUpgrade != nil {
failureStrategy = a.Cluster.Spec.GenericUpgrade.FailureStrategy
}

switch failureStrategy {
case scyllav1alpha1.GenericUpgradeFailureStrategyRetry:
if err := wait.PollImmediateInfinite(a.pollInterval, a.nodeUpgradedConditionFunc(ctx)); err != nil {
return ActionFailure, errors.Wrap(err, "validate node upgrade")
}
return ActionFailure, errors.Wrap(err, "validate node upgrade")
default:
return ActionFailure, errors.Errorf("unsupported failure strategy %s", failureStrategy)
}

return ActionSuccess, nil
Expand Down

0 comments on commit 5bc4e30

Please sign in to comment.