Skip to content

Commit

Permalink
daemon: make cordon/uncordon more robust and better logging
Browse files Browse the repository at this point in the history
Before marking cordon/uncordon as successful,
also check that node.Spec.Unschedulable has been set
correctly.
Also add additional logging while performing cordon/uncordon.

This is to help debug bugs such as
https://bugzilla.redhat.com/show_bug.cgi?id=2022387

Manual backport of PRs:
- openshift#2829
- openshift#2659
- openshift#2657
  • Loading branch information
sinnykumari committed Dec 2, 2021
1 parent 353796f commit 7a856fc
Show file tree
Hide file tree
Showing 2 changed files with 30 additions and 4 deletions.
3 changes: 2 additions & 1 deletion pkg/daemon/daemon.go
Expand Up @@ -1286,7 +1286,8 @@ func (dn *Daemon) completeUpdate(desiredConfigName string) error {
return err
}

dn.logSystem("completed update for config %s", desiredConfigName)
dn.logSystem("Update completed for config %s and node has been successfully uncordoned", desiredConfigName)
dn.recorder.Eventf(getNodeRef(dn.node), corev1.EventTypeNormal, "Uncordon", fmt.Sprintf("Update completed for config %s and node has been uncordoned", desiredConfigName))

return nil
}
Expand Down
31 changes: 28 additions & 3 deletions pkg/daemon/update.go
Expand Up @@ -186,26 +186,51 @@ func (dn *Daemon) finalizeBeforeReboot(newConfig *mcfgv1.MachineConfig) (retErr
}

// cordonOrUncordonNode cordons the node when desired is true and uncordons it
// when desired is false, retrying with exponential backoff (5 steps, starting
// at 10s, factor 2).
//
// An attempt only counts as successful when RunCordonOrUncordon returns no
// error AND a freshly fetched copy of the node reports
// Spec.Unschedulable == desired. Every unsuccessful attempt records its cause
// in lastErr so that, once the retries are exhausted, the returned error
// describes why the final attempt failed.
func (dn *Daemon) cordonOrUncordonNode(desired bool) error {
	verb := "cordon"
	if !desired {
		verb = "uncordon"
	}

	backoff := wait.Backoff{
		Steps:    5,
		Duration: 10 * time.Second,
		Factor:   2,
	}
	var lastErr error
	if err := wait.ExponentialBackoff(backoff, func() (bool, error) {
		// Log has been added to ensure that MCO is correctly performing cordon/uncordon.
		// This should help us with debugging bugs like https://bugzilla.redhat.com/show_bug.cgi?id=2022387
		glog.Infof("Initiating %s on node (currently schedulable: %t)", verb, !dn.node.Spec.Unschedulable)
		err := drain.RunCordonOrUncordon(dn.drainer, dn.node, desired)
		if err != nil {
			lastErr = err
			glog.Infof("%s failed with: %v, retrying", verb, err)
			return false, nil
		}

		// Re-fetch node so that we are not using cached information
		var node *corev1.Node
		if node, err = dn.nodeLister.Get(dn.node.GetName()); err != nil {
			lastErr = err
			glog.Errorf("Failed to fetch node %v, retrying", err)
			return false, nil
		}

		if node.Spec.Unschedulable != desired {
			// See https://bugzilla.redhat.com/show_bug.cgi?id=2022387
			//
			// Record this as the latest failure. Without it lastErr could be
			// nil (or a stale error from an earlier attempt) when the retries
			// run out, and wrapping a nil error yields nil with
			// github.com/pkg/errors, which would report success to the caller
			// even though the node never reached the desired state.
			lastErr = errors.Errorf("node is still not in %s state", verb)
			glog.Infof("RunCordonOrUncordon() succeeded but node is still not in %s state, retrying", verb)
			return false, nil
		}

		glog.Infof("%s succeeded on node (currently schedulable: %t)", verb, !node.Spec.Unschedulable)
		return true, nil
	}); err != nil {
		if err == wait.ErrWaitTimeout {
			return errors.Wrapf(lastErr, "failed to %s node (%d tries): %v", verb, backoff.Steps, err)
		}
		return errors.Wrapf(err, "failed to %s node", verb)
	}

	return nil
}

Expand Down

0 comments on commit 7a856fc

Please sign in to comment.