Merge pull request #152 from sp98/BZ-1899743
Bug 1916585: ceph: osd pdb reconciler changes
travisn committed Jan 15, 2021
2 parents 1ae5ac6 + 2ad237f commit 8c7ed58
Showing 23 changed files with 760 additions and 1,169 deletions.
4 changes: 4 additions & 0 deletions cluster/charts/rook-ceph/templates/resources.yaml
@@ -47,6 +47,8 @@ spec:
type: boolean
osdMaintenanceTimeout:
type: integer
pgHealthCheckTimeout:
type: integer
manageMachineDisruptionBudgets:
type: boolean
skipUpgradeChecks:
@@ -88,6 +90,8 @@ spec:
type: boolean
osdMaintenanceTimeout:
type: integer
pgHealthCheckTimeout:
type: integer
manageMachineDisruptionBudgets:
type: boolean
useAllNodes:
1 change: 1 addition & 0 deletions cluster/examples/kubernetes/ceph/cluster-on-pvc.yaml
@@ -163,5 +163,6 @@ spec:
disruptionManagement:
managePodBudgets: false
osdMaintenanceTimeout: 30
pgHealthCheckTimeout: 0
manageMachineDisruptionBudgets: false
machineDisruptionBudgetNamespace: openshift-machine-api
4 changes: 4 additions & 0 deletions cluster/examples/kubernetes/ceph/cluster.yaml
@@ -206,6 +206,10 @@ spec:
# A duration in minutes that determines how long an entire failureDomain like `region/zone/host` will be held in `noout` (in addition to the
# default DOWN/OUT interval) when it is draining. This is only relevant when `managePodBudgets` is `true`. The default value is `30` minutes.
osdMaintenanceTimeout: 30
# A duration in minutes that the operator will wait for the placement groups to become healthy (active+clean) after a drain was completed and OSDs came back up.
# The operator will continue with the next drain if the timeout is exceeded. It only works if `managePodBudgets` is `true`.
# No value or 0 means that the operator will wait until the placement groups are healthy before unblocking the next drain.
pgHealthCheckTimeout: 0
# If true, the operator will create and manage MachineDisruptionBudgets to ensure OSDs are only fenced when the cluster is healthy.
# Only available on OpenShift.
manageMachineDisruptionBudgets: false
4 changes: 4 additions & 0 deletions cluster/examples/kubernetes/ceph/common.yaml
@@ -66,6 +66,8 @@ spec:
type: boolean
osdMaintenanceTimeout:
type: integer
pgHealthCheckTimeout:
type: integer
manageMachineDisruptionBudgets:
type: boolean
skipUpgradeChecks:
@@ -107,6 +109,8 @@ spec:
type: boolean
osdMaintenanceTimeout:
type: integer
pgHealthCheckTimeout:
type: integer
manageMachineDisruptionBudgets:
type: boolean
useAllNodes:
49 changes: 30 additions & 19 deletions design/ceph/ceph-managed-disruptionbudgets.md
@@ -14,26 +14,37 @@

OSDs do not fit under the single PodDisruptionBudget pattern. Ceph's ability to tolerate pod disruptions in one failure domain is dependent on the overall health of the cluster.
Even if an upgrade agent were only to drain one node at a time, Ceph would have to wait until there were no undersized PGs before moving on to the next.
Therefore, we will create a PodDisruptionBudget per failure domain that does not allow any evictions by default.
When attempts to drain are detected, we will delete the PodDisruptionBudget on one node at a time, progressing to the next only after Ceph is healthy enough to avoid data loss/unavailability.
The failure domain will be determined by the smallest failure domain of all the Ceph Pools in that cluster.

Detecting drains is not easy as they are a client-side operation. The client cordons the node and continuously attempts to evict all pods from the node until it succeeds.
We will use a heuristic to detect drains. We will create a canary deployment for each node with a nodeSelector for that node. Since it is likely that the pod will only be
removed from the node in the event of a drain, we will rely on the assumption that if that pod is not running, the node is being drained. This is not a dangerous assumption,
as false positives for drains are harmless in this use case.

Example flow:
- A Ceph pool CRD is created.
- The Rook operator creates a PDB with maxUnavailable of 0 for each failure domain.
- A cluster upgrade agent wants to perform a kernel upgrade on the nodes.
- It attempts to drain 1 or more nodes.
- The drain attempt successfully evicts the canary pod.
- The Rook operator interprets this as a drain request that it can grant by deleting the PDB.
- The Rook operator deletes one PDB, and the blocked drain on that failure domain completes.
- The OSDs on that node come back up, all the necessary backfilling occurs, and all the OSDs are active+clean.
- The Rook operator recreates the PDB on that failure domain.
- The process is repeated with the subsequent nodes/failure domains.
The failure domain will be determined by the smallest failure domain of all the Ceph Pools in that cluster.
We begin by creating a single PodDisruptionBudget for all the OSDs with maxUnavailable=1. This allows one OSD to go down at any time. Once the user drains a node and an OSD goes down, we determine the failure domain of the draining OSD (using the OSD deployment labels). We then create blocking PodDisruptionBudgets (maxUnavailable=0) for all other failure domains and delete the main PodDisruptionBudget. This prevents OSDs from going down in multiple failure domains simultaneously.

Once the drained OSDs are back and all the PGs are active+clean, that is, the cluster is healed, the main PodDisruptionBudget is added back and the blocking ones are deleted. The user can also set a timeout (`pgHealthCheckTimeout`) for the PGs to become healthy. If the timeout is exceeded, the operator ignores PG health, adds the main PodDisruptionBudget back, and deletes the blocking ones.
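
A minimal sketch of this decision rule is shown below. This is not Rook's actual reconciler code; the names and signature are illustrative, and the real controller creates and deletes `PodDisruptionBudget` objects rather than returning a value.

```go
package main

import (
	"fmt"
	"time"
)

// pdbAction lists the PDB changes the reconciler should make for the current
// state. These names are illustrative, not Rook's actual types.
type pdbAction struct {
	BlockFailureDomains []string // failure domains that get a maxUnavailable=0 PDB
	RestoreMainPDB      bool     // recreate the single maxUnavailable=1 PDB covering all OSDs
}

// decidePDBs sketches the rule described above: while a failure domain is
// being drained and the PGs are not yet active+clean, block every *other*
// failure domain; once the PGs are clean, or pgHealthCheckTimeout has elapsed
// since the drained OSDs came back up, restore the single main PDB.
func decidePDBs(allDomains []string, drainedDomain string, osdsBackUp, pgsClean bool,
	osdsUpSince time.Time, pgHealthCheckTimeout time.Duration) pdbAction {

	timedOut := pgHealthCheckTimeout > 0 && osdsBackUp && time.Since(osdsUpSince) >= pgHealthCheckTimeout
	if drainedDomain == "" || (osdsBackUp && (pgsClean || timedOut)) {
		return pdbAction{RestoreMainPDB: true}
	}

	// Block all failure domains other than the one currently being drained.
	blocked := make([]string, 0, len(allDomains))
	for _, d := range allDomains {
		if d != drainedDomain {
			blocked = append(blocked, d)
		}
	}
	return pdbAction{BlockFailureDomains: blocked}
}

func main() {
	zones := []string{"zone-x", "zone-y", "zone-z"}
	a := decidePDBs(zones, "zone-x", false, false, time.Time{}, 10*time.Minute)
	fmt.Printf("block: %v, restore main PDB: %v\n", a.BlockFailureDomains, a.RestoreMainPDB)
	// Output: block: [zone-y zone-z], restore main PDB: false
}
```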

Detecting drains is not easy as they are a client-side operation. The client cordons the node and continuously attempts to evict all pods from the node until it succeeds. Whenever an OSD deployment goes into a pending state, that is, its `ReadyReplicas` count is 0, we assume that a drain operation is in progress.
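
A minimal sketch of that heuristic, assuming the OSD deployments have already been listed and carry a failure-domain label (the `failure-domain` key below is illustrative; Rook reads the failure domain from the labels it sets on OSD deployments):

```go
package main

import (
	"fmt"

	appsv1 "k8s.io/api/apps/v1"
)

// drainingFailureDomains applies the heuristic above: an OSD deployment whose
// ReadyReplicas count is 0 is assumed to sit on a node that is being drained.
func drainingFailureDomains(osdDeployments []appsv1.Deployment) []string {
	var domains []string
	for _, d := range osdDeployments {
		if d.Status.ReadyReplicas == 0 {
			domains = append(domains, d.Labels["failure-domain"])
		}
	}
	return domains
}

func main() {
	down := appsv1.Deployment{}
	down.Labels = map[string]string{"failure-domain": "zone-x"}
	down.Status.ReadyReplicas = 0
	fmt.Println(drainingFailureDomains([]appsv1.Deployment{down})) // [zone-x]
}
```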

Example scenario:

- Zone x
  - Node a
    - osd.0
    - osd.1
- Zone y
  - Node b
    - osd.2
    - osd.3
- Zone z
  - Node c
    - osd.4
    - osd.5

1. Rook Operator creates a single PDB that covers all OSDs with maxUnavailable=1.
2. When the Rook Operator sees an OSD go down (for example, osd.0 goes down):
   - Create a PDB for each failure domain (zones y and z) with maxUnavailable=0 where the OSD did *not* go down.
   - Delete the original PDB that covers all OSDs.
   - Now all remaining OSDs in zone x would be allowed to be drained.
3. When Rook sees the OSDs are back up and all PGs are clean:
   - Restore the PDB that covers all OSDs with maxUnavailable=1.
   - Delete the PDBs (in zones y and z) where maxUnavailable=0.

An example of an operator that will attempt to do rolling upgrades of nodes is the Machine Config Operator in OpenShift. Based on what I have seen in
[SIG cluster lifecycle](https://github.com/kubernetes/community/tree/master/sig-cluster-lifecycle), Kubernetes deployments based on the cluster-api approach will be
8 changes: 7 additions & 1 deletion pkg/apis/ceph.rook.io/v1/types.go
@@ -705,10 +705,16 @@ type DisruptionManagementSpec struct {
ManagePodBudgets bool `json:"managePodBudgets,omitempty"`

// OSDMaintenanceTimeout sets how many additional minutes the DOWN/OUT interval is for drained failure domains
// it only works if managePodBudgetss is true.
// it only works if managePodBudgets is true.
// the default is 30 minutes
OSDMaintenanceTimeout time.Duration `json:"osdMaintenanceTimeout,omitempty"`

// PGHealthCheckTimeout is the time (in minutes) that the operator will wait for the placement groups to become
// healthy (active+clean) after a drain was completed and OSDs came back up. Rook will continue with the next drain
// if the timeout is exceeded. It only works if managePodBudgets is true.
// No value or 0 means that the operator will wait until the placement groups are healthy before unblocking the next drain.
PGHealthCheckTimeout time.Duration `json:"pgHealthCheckTimeout,omitempty"`

// This enables management of machinedisruptionbudgets
ManageMachineDisruptionBudgets bool `json:"manageMachineDisruptionBudgets,omitempty"`

22 changes: 20 additions & 2 deletions pkg/operator/ceph/cluster/osd/labels.go
@@ -19,6 +19,7 @@ package osd
import (
"fmt"
"strconv"
"strings"

"github.com/rook/rook/pkg/operator/ceph/controller"
)
@@ -32,6 +33,8 @@ const (
CephDeviceSetPVCIDLabelKey = "ceph.rook.io/DeviceSetPVCId"
// OSDOverPVCLabelKey is the Rook PVC label key
OSDOverPVCLabelKey = "ceph.rook.io/pvc"
// TopologyLocationLabel is the crush location label added to OSD deployments
TopologyLocationLabel = "topology-location-%s"
)

func makeStorageClassDeviceSetPVCLabel(storageClassDeviceSetName, pvcStorageClassDeviceSetPVCId string, setIndex int) map[string]string {
@@ -42,11 +45,26 @@ func makeStorageClassDeviceSetPVCLabel(storageClassDeviceSetName, pvcStorageClas
}
}

func (c *Cluster) getOSDLabels(osdID int, failureDomainValue string, portable bool) map[string]string {
stringID := fmt.Sprintf("%d", osdID)
func (c *Cluster) getOSDLabels(osd OSDInfo, failureDomainValue string, portable bool) map[string]string {
stringID := fmt.Sprintf("%d", osd.ID)
labels := controller.CephDaemonAppLabels(AppName, c.clusterInfo.Namespace, "osd", stringID, true)
labels[OsdIdLabelKey] = stringID
labels[FailureDomainKey] = failureDomainValue
labels[portableKey] = strconv.FormatBool(portable)
for k, v := range getOSDTopologyLocationLabels(osd.Location) {
labels[k] = v
}
return labels
}

func getOSDTopologyLocationLabels(topologyLocation string) map[string]string {
labels := map[string]string{}
locations := strings.Split(topologyLocation, " ")
for _, location := range locations {
loc := strings.Split(location, "=")
if len(loc) == 2 {
labels[fmt.Sprintf(TopologyLocationLabel, loc[0])] = loc[1]
}
}
return labels
}
31 changes: 31 additions & 0 deletions pkg/operator/ceph/cluster/osd/labels_test.go
@@ -0,0 +1,31 @@
/*
Copyright 2020 The Rook Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package osd

import (
"testing"

"github.com/stretchr/testify/assert"
)

func TestOSDTopologyLabels(t *testing.T) {
fakeLocation := "root=default host=ocs-deviceset-gp2-1-data-0-wh5wl region=us-east-1 zone=us-east-1c"
result := getOSDTopologyLocationLabels(fakeLocation)
assert.Equal(t, "us-east-1", result["topology-location-region"])
assert.Equal(t, "ocs-deviceset-gp2-1-data-0-wh5wl", result["topology-location-host"])
assert.Equal(t, "us-east-1c", result["topology-location-zone"])
}
5 changes: 3 additions & 2 deletions pkg/operator/ceph/cluster/osd/osd_test.go
@@ -317,6 +317,7 @@ func TestGetPVCHostName(t *testing.T) {
clientset := fake.NewSimpleClientset()
clusterInfo := &client.ClusterInfo{Namespace: "ns"}
c := &Cluster{context: &clusterd.Context{Clientset: clientset}, clusterInfo: clusterInfo}
osdInfo := OSDInfo{ID: 23}
pvcName := "test-pvc"

// fail to get the host name when there is no pod or deployment
@@ -329,7 +330,7 @@
ObjectMeta: metav1.ObjectMeta{
Name: "osd-23",
Namespace: c.clusterInfo.Namespace,
Labels: c.getOSDLabels(23, "", true),
Labels: c.getOSDLabels(osdInfo, "", true),
},
}
k8sutil.AddLabelToDeployment(OSDOverPVCLabelKey, pvcName, osdDeployment)
@@ -350,7 +351,7 @@
ObjectMeta: metav1.ObjectMeta{
Name: "osd-23",
Namespace: c.clusterInfo.Namespace,
Labels: c.getOSDLabels(23, "", true),
Labels: c.getOSDLabels(osdInfo, "", true),
},
}
osdPod.Labels = map[string]string{OSDOverPVCLabelKey: pvcName}
4 changes: 2 additions & 2 deletions pkg/operator/ceph/cluster/osd/spec.go
@@ -419,7 +419,7 @@ func (c *Cluster) makeDeployment(osdProps osdProperties, osd OSDInfo, provisionC
podTemplateSpec := v1.PodTemplateSpec{
ObjectMeta: metav1.ObjectMeta{
Name: AppName,
Labels: c.getOSDLabels(osd.ID, failureDomainValue, osdProps.portable),
Labels: c.getOSDLabels(osd, failureDomainValue, osdProps.portable),
},
Spec: v1.PodSpec{
RestartPolicy: v1.RestartPolicyAlways,
@@ -462,7 +462,7 @@
ObjectMeta: metav1.ObjectMeta{
Name: deploymentName,
Namespace: c.clusterInfo.Namespace,
Labels: c.getOSDLabels(osd.ID, failureDomainValue, osdProps.portable),
Labels: c.getOSDLabels(osd, failureDomainValue, osdProps.portable),
},
Spec: apps.DeploymentSpec{
Selector: &metav1.LabelSelector{
2 changes: 0 additions & 2 deletions pkg/operator/ceph/cluster/register_controllers.go
@@ -25,7 +25,6 @@ import (
"github.com/rook/rook/pkg/operator/ceph/disruption/controllerconfig"
"github.com/rook/rook/pkg/operator/ceph/disruption/machinedisruption"
"github.com/rook/rook/pkg/operator/ceph/disruption/machinelabel"
"github.com/rook/rook/pkg/operator/ceph/disruption/nodedrain"
"github.com/rook/rook/pkg/operator/ceph/file"
"github.com/rook/rook/pkg/operator/ceph/nfs"
"github.com/rook/rook/pkg/operator/ceph/object"
@@ -45,7 +44,6 @@

// AddToManagerFuncsMaintenance is a list of functions to add all Controllers to the Manager (entrypoint for controller)
var AddToManagerFuncsMaintenance = []func(manager.Manager, *controllerconfig.Context) error{
nodedrain.Add,
clusterdisruption.Add,
}

81 changes: 4 additions & 77 deletions pkg/operator/ceph/disruption/clusterdisruption/add.go
@@ -18,8 +18,6 @@ package clusterdisruption

import (
"github.com/rook/rook/pkg/operator/ceph/disruption/controllerconfig"
"github.com/rook/rook/pkg/operator/ceph/disruption/nodedrain"
"github.com/rook/rook/pkg/operator/k8sutil"

"sigs.k8s.io/controller-runtime/pkg/controller"
"sigs.k8s.io/controller-runtime/pkg/handler"
@@ -29,8 +27,6 @@ import (

"github.com/pkg/errors"
cephv1 "github.com/rook/rook/pkg/apis/ceph.rook.io/v1"
appsv1 "k8s.io/api/apps/v1"
policyv1beta1 "k8s.io/api/policy/v1beta1"
"k8s.io/apimachinery/pkg/types"
)

@@ -49,11 +45,10 @@ func Add(mgr manager.Manager, context *controllerconfig.Context) error {
sharedClusterMap := &ClusterMap{}

reconcileClusterDisruption := &ReconcileClusterDisruption{
client: mgr.GetClient(),
scheme: mgrScheme,
context: context,
clusterMap: sharedClusterMap,
osdCrushLocationMap: &OSDCrushLocationMap{Context: context.ClusterdContext},
client: mgr.GetClient(),
scheme: mgrScheme,
context: context,
clusterMap: sharedClusterMap,
}
reconciler := reconcile.Reconciler(reconcileClusterDisruption)
// Create a new controller
@@ -83,74 +78,6 @@ func Add(mgr manager.Manager, context *controllerconfig.Context) error {
return err
}

// Watch for PodDisruptionBudgets and enqueue the CephCluster in the namespace
err = c.Watch(
&source.Kind{Type: &policyv1beta1.PodDisruptionBudget{}},
&handler.EnqueueRequestsFromMapFunc{
ToRequests: handler.ToRequestsFunc(func(obj handler.MapObject) []reconcile.Request {
_, ok := obj.Object.(*policyv1beta1.PodDisruptionBudget)
if !ok {
// not a pdb, returning empty
logger.Errorf("PDB handler received non-PDB")
return []reconcile.Request{}
}
labels := obj.Meta.GetLabels()

// only enqueue osdDisruptionAppLabels
_, ok = labels[osdDisruptionAppName]
if !ok {
return []reconcile.Request{}
}
// // The name will be populated in the reconcile
namespace := obj.Meta.GetNamespace()
req := reconcile.Request{NamespacedName: types.NamespacedName{Namespace: namespace}}

return []reconcile.Request{req}
}),
},
)
if err != nil {
return err
}

// Watch for canary Deployments created by the nodedrain controller and enqueue all Cephclusters
err = c.Watch(
&source.Kind{Type: &appsv1.Deployment{}},
&handler.EnqueueRequestsFromMapFunc{
ToRequests: handler.ToRequestsFunc(func(obj handler.MapObject) []reconcile.Request {
_, ok := obj.Object.(*appsv1.Deployment)
if !ok {
// not a Deployment, returning empty
logger.Errorf("deployment handler received non-Deployment")
return []reconcile.Request{}
}

// don't enqueue if it isn't a canary Deployment
labels := obj.Meta.GetLabels()
appLabel, ok := labels[k8sutil.AppAttr]
if !ok || appLabel != nodedrain.CanaryAppName {
return []reconcile.Request{}
}

// Enqueue all CephClusters
clusterNamespaces := sharedClusterMap.GetClusterNamespaces()
if len(clusterNamespaces) == 0 {
return []reconcile.Request{}
}
reqs := make([]reconcile.Request, 0)
for _, namespace := range clusterNamespaces {
// The name will be populated in the reconcile
reqs = append(reqs, reconcile.Request{NamespacedName: types.NamespacedName{Namespace: namespace}})
}

return reqs
}),
},
)
if err != nil {
return err
}

// Watch for CephBlockPools and enqueue the CephCluster in the namespace
err = c.Watch(&source.Kind{Type: &cephv1.CephBlockPool{}}, enqueueByNamespace)
if err != nil {
