Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ceph: do not check ok-to-stop when OSDs are in CLBO #8583

Merged
merged 1 commit into from
Aug 25, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
21 changes: 19 additions & 2 deletions pkg/daemon/ceph/client/upgrade.go
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,8 @@ const (
var (
// we don't perform any checks on these daemons
// they don't have any "ok-to-stop" command implemented
daemonNoCheck = []string{"mgr", "rgw", "rbd-mirror", "nfs", "fs-mirror"}
daemonNoCheck = []string{"mgr", "rgw", "rbd-mirror", "nfs", "fs-mirror"}
errNoHostInCRUSH = errors.New("no host in crush map yet?")
)

func getCephMonVersionString(context *clusterd.Context, clusterInfo *ClusterInfo) (string, error) {
Expand Down Expand Up @@ -311,7 +312,7 @@ func allOSDsSameHost(context *clusterd.Context, clusterInfo *ClusterInfo) (bool,

hostOsdNodes := len(hostOsdTree.Nodes)
if hostOsdNodes == 0 {
return false, errors.New("no host in crush map yet?")
return false, errNoHostInCRUSH
}

// If the number of OSD node is 1, chances are this is simple setup with all OSDs on it
Expand Down Expand Up @@ -369,6 +370,10 @@ func OSDUpdateShouldCheckOkToStop(context *clusterd.Context, clusterInfo *Cluste
// aio means all in one
aio, err := allOSDsSameHost(context, clusterInfo)
if err != nil {
if errors.Is(err, errNoHostInCRUSH) {
logger.Warning("the CRUSH map has no 'host' entries so not performing ok-to-stop checks")
return false
}
logger.Warningf("failed to determine if all osds are running on the same host. will check if OSDs are ok-to-stop. if all OSDs are running on one host %s. %v", userIntervention, err)
return true
}
Expand Down Expand Up @@ -401,6 +406,18 @@ func osdDoNothing(context *clusterd.Context, clusterInfo *ClusterInfo) bool {
// aio means all in one
aio, err := allOSDsSameHost(context, clusterInfo)
if err != nil {
// We return true so that we can continue without a retry and subsequently not test if the
// osd can be stopped. This handles the scenario where the OSDs have been created but not yet
// started due to a wrong CR configuration. For instance, when OSDs are encrypted and Vault
// is used to store encryption keys, if the KV version is incorrect during the cluster
// initialization the OSDs will fail to start and stay in CLBO until the CR is updated again
// with the correct KV version so that they can start. For this scenario we don't need to go
// down the path where we check whether the OSD can be stopped or not, since it would always
// fail and make us wait for nothing.
if errors.Is(err, errNoHostInCRUSH) {
logger.Warning("the CRUSH map has no 'host' entries so not performing ok-to-stop checks")
return true
}
logger.Warningf("failed to determine if all osds are running on the same host, performing upgrade check anyways. %v", err)
return false
}
Expand Down
7 changes: 7 additions & 0 deletions pkg/daemon/ceph/client/upgrade_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -357,4 +357,11 @@ func TestOSDUpdateShouldCheckOkToStop(t *testing.T) {
treeOutput = fake.OsdTreeOutput(0, 0)
assert.False(t, OSDUpdateShouldCheckOkToStop(context, clusterInfo))
})

// degraded case, OSDs are failing to start so they haven't registered in the CRUSH map yet
t.Run("0 nodes with down OSDs", func(t *testing.T) {
lsOutput = fake.OsdLsOutput(3)
treeOutput = fake.OsdTreeOutput(0, 1)
assert.False(t, OSDUpdateShouldCheckOkToStop(context, clusterInfo))
})
}