
Commit

Merge pull request #12912 from rook/mergify/bp/release-1.12/pr-12715
osd: make osd pod sleep when osds are flapping (backport #12715)
BlaineEXE committed Sep 15, 2023
2 parents e662d75 + 639b1ae commit 9fa0745
Showing 13 changed files with 137 additions and 33 deletions.
1 change: 1 addition & 0 deletions Documentation/CRDs/Cluster/ceph-cluster-crd.md
@@ -85,6 +85,7 @@ For more details on the mons and when to choose a number other than `3`, see the
* `onlyApplyOSDPlacement`: Whether the placement specific to OSDs is merged with the `all` placement. If `false`, the OSD placement will be merged with the `all` placement. If `true`, only the OSD placement will be applied and the `all` placement will be ignored. The placement for OSDs is computed from several different places depending on the type of OSD:
* For non-PVCs: `placement.all` and `placement.osd`
* For PVCs: `placement.all` and inside the storageClassDeviceSets from the `placement` or `preparePlacement`
* `flappingRestartIntervalHours`: Defines how long an OSD pod will sleep before restarting when it stopped due to flapping. Flapping occurs when OSDs are marked `down` by Ceph more than 5 times in 600 seconds. A flapping OSD stays down because it likely has a bad disk or another issue that needs investigation. The default is 24 hours. If the underlying issue is fixed manually, the OSD pod can be restarted manually before the interval elapses (see the example after this diff section).
* `disruptionManagement`: The section for configuring management of daemon disruptions
* `managePodBudgets`: if `true`, the operator will create and manage PodDisruptionBudgets for OSD, Mon, RGW, and MDS daemons. OSD PDBs are managed dynamically via the strategy outlined in the [design](https://github.com/rook/rook/blob/master/design/ceph/ceph-managed-disruptionbudgets.md). The operator will block eviction of OSDs by default and unblock them safely when drains are detected.
* `osdMaintenanceTimeout`: is a duration in minutes that determines how long an entire failureDomain like `region/zone/host` will be held in `noout` (in addition to the default DOWN/OUT interval) when it is draining. This is only relevant when `managePodBudgets` is `true`. The default value is `30` minutes.
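For reference, here is a minimal `CephCluster` sketch using the new field. Everything other than `flappingRestartIntervalHours` is illustrative boilerplate (image tag, mon count, device selection) and not prescribed by this change:

```yaml
apiVersion: ceph.rook.io/v1
kind: CephCluster
metadata:
  name: rook-ceph
  namespace: rook-ceph
spec:
  cephVersion:
    image: quay.io/ceph/ceph:v17.2.6
  dataDirHostPath: /var/lib/rook
  mon:
    count: 3
  storage:
    useAllNodes: true
    useAllDevices: true
    # Sleep a flapped OSD pod for 12 hours before restarting (default: 24)
    flappingRestartIntervalHours: 12
```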
17 changes: 17 additions & 0 deletions Documentation/CRDs/specification.md
@@ -11498,6 +11498,23 @@ OSDStore
<em>(Optional)</em>
</td>
</tr>
<tr>
<td>
<code>flappingRestartIntervalHours</code><br/>
<em>
int
</em>
</td>
<td>
<em>(Optional)</em>
<p>FlappingRestartIntervalHours defines how long an OSD pod that exited with a zero exit code will sleep before restarting.
This handles OSD flapping, where OSD daemons are marked <code>down</code> by Ceph more than 5 times in 600 seconds.
Keeping the OSD pod from restarting immediately in such a scenario prevents the OSD from being marked <code>up</code> again
and thus avoids repeated peering of the PGs mapped to it.
The interval defaults to 24 hours if no value is provided. The user needs to restart the OSD pod manually if they manage
to fix the underlying flapping issue before the restart interval elapses.</p>
</td>
</tr>
</tbody>
</table>
<h3 id="ceph.rook.io/v1.StoreType">StoreType
3 changes: 3 additions & 0 deletions deploy/charts/rook-ceph/templates/resources.yaml
@@ -2897,6 +2897,9 @@ spec:
nullable: true
type: array
x-kubernetes-preserve-unknown-fields: true
flappingRestartIntervalHours:
description: FlappingRestartIntervalHours defines how long an OSD pod that exited with a zero exit code will sleep before restarting. This handles OSD flapping, where OSD daemons are marked `down` by Ceph more than 5 times in 600 seconds. Keeping the OSD pod from restarting immediately in such a scenario prevents the OSD from being marked `up` again and thus avoids repeated peering of the PGs mapped to it. The interval defaults to 24 hours if no value is provided. The user needs to restart the OSD pod manually if they manage to fix the underlying flapping issue before the restart interval elapses.
type: integer
nodes:
items:
description: Node is a storage node
2 changes: 2 additions & 0 deletions deploy/examples/cluster.yaml
@@ -261,6 +261,8 @@ spec:
# deviceFilter: "^sd."
# when onlyApplyOSDPlacement is false, will merge both placement.All() and placement.osd
onlyApplyOSDPlacement: false
# Time for which an OSD pod will sleep before restarting, if it stopped due to flapping
# flappingRestartIntervalHours: 24
# The section for configuring management of daemon disruptions during upgrade or fencing.
disruptionManagement:
# If true, the operator will create and manage PodDisruptionBudgets for OSD, Mon, RGW, and MDS daemons. OSD PDBs are managed dynamically
3 changes: 3 additions & 0 deletions deploy/examples/crds.yaml
@@ -2895,6 +2895,9 @@ spec:
nullable: true
type: array
x-kubernetes-preserve-unknown-fields: true
flappingRestartIntervalHours:
description: FlappingRestartIntervalHours defines how long an OSD pod that exited with a zero exit code will sleep before restarting. This handles OSD flapping, where OSD daemons are marked `down` by Ceph more than 5 times in 600 seconds. Keeping the OSD pod from restarting immediately in such a scenario prevents the OSD from being marked `up` again and thus avoids repeated peering of the PGs mapped to it. The interval defaults to 24 hours if no value is provided. The user needs to restart the OSD pod manually if they manage to fix the underlying flapping issue before the restart interval elapses.
type: integer
nodes:
items:
description: Node is a storage node
8 changes: 8 additions & 0 deletions pkg/apis/ceph.rook.io/v1/types.go
@@ -2679,6 +2679,14 @@ type StorageScopeSpec struct {
StorageClassDeviceSets []StorageClassDeviceSet `json:"storageClassDeviceSets,omitempty"`
// +optional
Store OSDStore `json:"store,omitempty"`
// +optional
// FlappingRestartIntervalHours defines how long an OSD pod that exited with a zero exit code will sleep before restarting.
// This handles OSD flapping, where OSD daemons are marked `down` by Ceph more than 5 times in 600 seconds.
// Keeping the OSD pod from restarting immediately in such a scenario prevents the OSD from being marked `up` again
// and thus avoids repeated peering of the PGs mapped to it.
// The interval defaults to 24 hours if no value is provided. The user needs to restart the OSD pod manually if they
// manage to fix the underlying flapping issue before the restart interval elapses.
FlappingRestartIntervalHours int `json:"flappingRestartIntervalHours"`
}

// OSDStore is the backend storage type used for creating the OSDs
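For anyone setting this through the Go API instead of YAML, the field lives on `StorageScopeSpec`. A minimal sketch, assuming the `cephv1` import path from this repository:

```go
package main

import (
	"fmt"

	cephv1 "github.com/rook/rook/pkg/apis/ceph.rook.io/v1"
)

func main() {
	spec := cephv1.ClusterSpec{
		Storage: cephv1.StorageScopeSpec{
			// 0 falls back to the operator's 24-hour default.
			FlappingRestartIntervalHours: 12,
		},
	}
	fmt.Println(spec.Storage.FlappingRestartIntervalHours) // 12
}
```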
5 changes: 5 additions & 0 deletions pkg/apis/ceph.rook.io/v1/zz_generated.deepcopy.go

Some generated files are not rendered by default.

18 changes: 13 additions & 5 deletions pkg/operator/ceph/cluster/osd/osd.go
@@ -22,6 +22,7 @@ import (
"context"
"fmt"
"reflect"
"regexp"
"sort"
"strconv"
"strings"
@@ -595,13 +596,11 @@ func (c *Cluster) getOSDInfo(d *appsv1.Deployment) (OSDInfo, error) {
}

locationFound := false
for _, a := range container.Args {
for _, a := range container.Command {
locationPrefix := "--crush-location="
if strings.HasPrefix(a, locationPrefix) {
if strings.Contains(a, locationPrefix) {
locationFound = true
// Extract the same CRUSH location as originally determined by the OSD prepare pod
// by cutting off the prefix: --crush-location=
osd.Location = a[len(locationPrefix):]
osd.Location = getLocationWithRegex(a)
}
}

@@ -944,3 +943,12 @@ func (c *Cluster) getOSDStoreStatus() (*cephv1.OSDStatus, error) {
StoreType: storeType,
}, nil
}

func getLocationWithRegex(input string) string {
rx := regexp.MustCompile(`--crush-location="(.+?)"`)
match := rx.FindStringSubmatch(input)
if len(match) == 2 {
return strings.TrimSpace(match[1])
}
return ""
}
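Context for the switch from `strings.HasPrefix` plus prefix-slicing to `strings.Contains` plus a regex: the OSD container now receives its whole command line as a single `bash -c` string (see `osdStartScript` in spec.go below), so `--crush-location` appears as a quoted substring rather than a discrete arg. A self-contained sketch, with an illustrative joined command string:

```go
package main

import (
	"fmt"
	"regexp"
	"strings"
)

// Same logic as getLocationWithRegex in osd.go: pull the quoted CRUSH
// location out of a joined command line.
func getLocationWithRegex(input string) string {
	rx := regexp.MustCompile(`--crush-location="(.+?)"`)
	match := rx.FindStringSubmatch(input)
	if len(match) == 2 {
		return strings.TrimSpace(match[1])
	}
	return ""
}

func main() {
	joined := `ceph-osd --foreground --id 0 --crush-location="root=default host=node1" & wait`
	fmt.Println(getLocationWithRegex(joined)) // root=default host=node1
}
```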
11 changes: 11 additions & 0 deletions pkg/operator/ceph/cluster/osd/osd_test.go
@@ -974,3 +974,14 @@ func TestUpdateCephStorageStatus(t *testing.T) {
assert.Equal(t, 1, cephCluster.Status.CephStorage.OSD.StoreType["bluestore"])
})
}

func TestGetLocationWithRegex(t *testing.T) {
location := getLocationWithRegex("")
assert.Equal(t, "", location)

location = getLocationWithRegex(`ceph-osd --crush-location="root=default host=node" --default-log-to-stderr=true`)
assert.Equal(t, "root=default host=node", location)

location = getLocationWithRegex(`ceph-osd --crush-location="" --default-log-to-stderr=true`)
assert.Equal(t, "", location)
}
54 changes: 44 additions & 10 deletions pkg/operator/ceph/cluster/osd/spec.go
@@ -22,6 +22,7 @@ import (
"path"
"path/filepath"
"strconv"
"strings"

"github.com/pkg/errors"
cephv1 "github.com/rook/rook/pkg/apis/ceph.rook.io/v1"
@@ -61,16 +62,36 @@ const (
// DmcryptMetadataType is a portion of the device mapper name for the encrypted OSD on PVC block
DmcryptMetadataType = "db-dmcrypt"
// DmcryptWalType is a portion of the device mapper name for the encrypted OSD on PVC wal
DmcryptWalType = "wal-dmcrypt"
bluestoreBlockName = "block"
bluestoreMetadataName = "block.db"
bluestoreWalName = "block.wal"
tempEtcCephDir = "/etc/temp-ceph"
osdPortv1 = 6801
osdPortv2 = 6800
DmcryptWalType = "wal-dmcrypt"
bluestoreBlockName = "block"
bluestoreMetadataName = "block.db"
bluestoreWalName = "block.wal"
tempEtcCephDir = "/etc/temp-ceph"
osdPortv1 = 6801
osdPortv2 = 6800
defaultOSDRestartInterval = 24
)

const (
cephOSDStart = `
function sigterm() {
echo "SIGTERM received"
exit
}
trap sigterm SIGTERM
%s %s & wait
rc=$?
RESTART_INTERVAL=%d
if [ $rc -eq 0 ]; then
touch /tmp/osd-sleep
echo "OSD daemon exited with code 0, possibly due to OSD flapping. The OSD pod will sleep for $RESTART_INTERVAL hours. Restart the pod manually once the flapping issue is fixed"
sleep "$RESTART_INTERVAL"h & wait
exit $rc
fi`

activateOSDOnNodeCode = `
set -o errexit
set -o pipefail
@@ -400,7 +421,7 @@ func (c *Cluster) makeDeployment(osdProps osdProperties, osd OSDInfo, provisionC
"--fsid", c.clusterInfo.FSID,
"--setuser", "ceph",
"--setgroup", "ceph",
fmt.Sprintf("--crush-location=%s", osd.Location),
fmt.Sprintf("--crush-location=%q", osd.Location),
}...)

// Ceph expects initial weight as float value in tera-bytes units
@@ -598,8 +619,7 @@ func (c *Cluster) makeDeployment(osdProps osdProperties, osd OSDInfo, provisionC
InitContainers: initContainers,
Containers: []v1.Container{
{
Command: command,
Args: args,
Command: osdStartScript(command, args, c.spec.Storage.FlappingRestartIntervalHours),
Name: "osd",
Image: c.spec.CephVersion.Image,
ImagePullPolicy: controller.GetContainerImagePullPolicy(c.spec.CephVersion.ImagePullPolicy),
@@ -1396,3 +1416,17 @@ func (c *Cluster) getOSDServicePorts() []v1.ServicePort {

return ports
}

func osdStartScript(cmd, args []string, interval int) []string {
osdRestartInterval := defaultOSDRestartInterval
if interval != 0 {
osdRestartInterval = interval
}

return []string{
"/bin/bash",
"-c",
"-x",
fmt.Sprintf(cephOSDStart, strings.Join(cmd, " "), strings.Join(args, " "), osdRestartInterval),
}
}
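To make the wrapper concrete, here is a runnable sketch of what ends up as the OSD container's entrypoint. The start script is abbreviated and the `ceph-osd` flags are illustrative; the full template is in the diff above:

```go
package main

import (
	"fmt"
	"strings"
)

// Abbreviated copy of cephOSDStart; the full template with the SIGTERM trap
// is in spec.go above.
const cephOSDStart = `
%s %s & wait
rc=$?
RESTART_INTERVAL=%d
if [ $rc -eq 0 ]; then
	touch /tmp/osd-sleep
	sleep "$RESTART_INTERVAL"h & wait
	exit $rc
fi`

func osdStartScript(cmd, args []string, interval int) []string {
	if interval == 0 {
		interval = 24 // defaultOSDRestartInterval
	}
	return []string{
		"/bin/bash", "-c", "-x",
		fmt.Sprintf(cephOSDStart, strings.Join(cmd, " "), strings.Join(args, " "), interval),
	}
}

func main() {
	command := osdStartScript(
		[]string{"ceph-osd"},
		[]string{"--foreground", "--id", "0", `--crush-location="root=default host=node1"`},
		0,
	)
	// The deployment's single "osd" container runs this as its Command.
	fmt.Println(command[3])
}
```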
5 changes: 2 additions & 3 deletions pkg/operator/ceph/cluster/osd/spec_test.go
@@ -178,7 +178,6 @@ func testPodDevices(t *testing.T, dataDir, deviceName string, allDevices bool) {
cont := deployment.Spec.Template.Spec.Containers[0]
assert.Equal(t, spec.CephVersion.Image, cont.Image)
assert.Equal(t, 8, len(cont.VolumeMounts))
assert.Equal(t, "ceph-osd", cont.Command[0])
verifyEnvVar(t, cont.Env, "TCMALLOC_MAX_TOTAL_THREAD_CACHE_BYTES", "134217728", true)

// Test OSD on PVC with LVM
@@ -434,15 +433,15 @@ func testPodDevices(t *testing.T, dataDir, deviceName string, allDevices bool) {
deployment, err = c.makeDeployment(osdProp, osd, dataPathMap)
assert.NoError(t, err)
for _, flag := range defaultTuneFastSettings {
assert.Contains(t, deployment.Spec.Template.Spec.Containers[0].Args, flag)
assert.Contains(t, deployment.Spec.Template.Spec.Containers[0].Command[3], flag)
}

// Test tune Slow settings when OSD on PVC
osdProp.tuneSlowDeviceClass = true
deployment, err = c.makeDeployment(osdProp, osd, dataPathMap)
assert.NoError(t, err)
for _, flag := range defaultTuneSlowSettings {
assert.Contains(t, deployment.Spec.Template.Spec.Containers[0].Args, flag)
assert.Contains(t, deployment.Spec.Template.Spec.Containers[0].Command[3], flag)
}

// Test shareProcessNamespace presence
34 changes: 26 additions & 8 deletions pkg/operator/ceph/controller/spec.go
@@ -31,6 +31,7 @@ import (
"github.com/rook/rook/pkg/clusterd"
"github.com/rook/rook/pkg/daemon/ceph/client"
"github.com/rook/rook/pkg/operator/ceph/config"
opconfig "github.com/rook/rook/pkg/operator/ceph/config"
"github.com/rook/rook/pkg/operator/ceph/config/keyring"
"github.com/rook/rook/pkg/operator/k8sutil"
"github.com/rook/rook/pkg/util/display"
@@ -75,6 +76,26 @@ type daemonConfig struct {
var logger = capnslog.NewPackageLogger("github.com/rook/rook", "ceph-spec")

var (
osdLivenessProbeScript = `
outp="$(ceph --admin-daemon %s %s 2>&1)"
rc=$?
if [ $rc -ne 0 ] && [ ! -f /tmp/osd-sleep ]; then
echo "ceph daemon health check failed with the following output:"
echo "$outp" | sed -e 's/^/> /g'
exit $rc
fi
`

livenessProbeScript = `
outp="$(ceph --admin-daemon %s %s 2>&1)"
rc=$?
if [ $rc -ne 0 ]; then
echo "ceph daemon health check failed with the following output:"
echo "$outp" | sed -e 's/^/> /g'
exit $rc
fi
`

cronLogRotate = `
CEPH_CLIENT_ID=%s
PERIODICITY=%s
@@ -619,6 +640,10 @@ func StoredLogAndCrashVolumeMount(varLogCephDir, varLibCephCrashDir string) []v1
// that it can be called, and that it returns 0
func GenerateLivenessProbeExecDaemon(daemonType, daemonID string) *v1.Probe {
confDaemon := getDaemonConfig(daemonType, daemonID)
probeScript := livenessProbeScript
if daemonType == opconfig.OsdType {
probeScript = osdLivenessProbeScript
}

return &v1.Probe{
ProbeHandler: v1.ProbeHandler{
@@ -637,14 +662,7 @@ func GenerateLivenessProbeExecDaemon(daemonType, daemonID string) *v1.Probe {
"-i",
"sh",
"-c",
fmt.Sprintf(`outp="$(ceph --admin-daemon %s %s 2>&1)"
rc=$?
if [ $rc -ne 0 ]; then
echo "ceph daemon health check failed with the following output:"
echo "$outp" | sed -e 's/^/> /g'
exit $rc
fi`,
confDaemon.buildSocketPath(), confDaemon.buildAdminSocketCommand()),
fmt.Sprintf(probeScript, confDaemon.buildSocketPath(), confDaemon.buildAdminSocketCommand()),
},
},
},
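The only difference between the two probe scripts is the `[ ! -f /tmp/osd-sleep ]` guard, which completes the contract with the start script: a flapped OSD touches `/tmp/osd-sleep` before its long sleep, so the OSD probe keeps reporting healthy and the kubelet does not kill the deliberately sleeping pod. A trimmed-down sketch of the selection logic, with stand-in script bodies (not the real Rook helpers):

```go
package main

import "fmt"

const (
	// Stand-ins for the two scripts above.
	livenessProbeScript    = `if [ $rc -ne 0 ]; then exit $rc; fi`
	osdLivenessProbeScript = `if [ $rc -ne 0 ] && [ ! -f /tmp/osd-sleep ]; then exit $rc; fi`
)

// Mirrors the daemonType check added to GenerateLivenessProbeExecDaemon:
// only OSDs get the sleep-tolerant variant.
func probeScriptFor(daemonType string) string {
	if daemonType == "osd" {
		return osdLivenessProbeScript
	}
	return livenessProbeScript
}

func main() {
	fmt.Println(probeScriptFor("osd")) // tolerant while /tmp/osd-sleep exists
	fmt.Println(probeScriptFor("mon")) // strict, unchanged behavior
}
```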
9 changes: 2 additions & 7 deletions pkg/operator/ceph/controller/spec_test.go
@@ -159,13 +159,8 @@ func TestGenerateLivenessProbeExecDaemon(t *testing.T) {
"-i",
"sh",
"-c",
`outp="$(ceph --admin-daemon /run/ceph/ceph-osd.0.asok status 2>&1)"
rc=$?
if [ $rc -ne 0 ]; then
echo "ceph daemon health check failed with the following output:"
echo "$outp" | sed -e 's/^/> /g'
exit $rc
fi`}
fmt.Sprintf(osdLivenessProbeScript, "/run/ceph/ceph-osd.0.asok", "status"),
}

assert.Equal(t, expectedCommand, probe.ProbeHandler.Exec.Command)
assert.Equal(t, livenessProbeInitialDelaySeconds, probe.InitialDelaySeconds)
