Automates BZ: OCS podDisruptionBudget prevents successful OCP upgrades
Signed-off-by: Shrivaibavi Raghaventhiran <sraghave@redhat.com>
Shrivaibavi committed Aug 10, 2021
1 parent c43584c commit 91db156
Showing 5 changed files with 161 additions and 6 deletions.
41 changes: 41 additions & 0 deletions ocs_ci/ocs/cluster.py
@@ -19,6 +19,7 @@
from ocs_ci.ocs.exceptions import UnexpectedBehaviour
from ocs_ci.ocs.resources import ocs, storage_cluster
import ocs_ci.ocs.constants as constant
from ocs_ci.ocs import defaults
from ocs_ci.ocs.resources.mcg import MCG
from ocs_ci.utility.retry import retry
from ocs_ci.utility.utils import (
@@ -1712,6 +1713,46 @@ def check_ceph_health_after_add_capacity(
), "Data re-balance failed to complete"


def validate_existence_of_blocking_pdb():
    """
    Validate creation of PDBs for OSDs.

    1. Versions earlier than ocs-operator.v4.6.2 have one PDB per OSD
    2. Versions greater than or equal to ocs-operator.v4.6.2-233.ci have
       a single PDB for all OSDs, named rook-ceph-osd

    Returns:
        bool: True if blocking PDBs are present, False otherwise

    """
    pdb_obj = ocp.OCP(
        kind=constants.POD_DISRUPTION_BUDGET, namespace=defaults.ROOK_CLUSTER_NAMESPACE
    )
    pdb_obj_get = pdb_obj.get()
    # Collect every PDB except the mds and mon ones
    osd_pdb = []
    for pdb in pdb_obj_get.get("items"):
        if (
            constants.MDS_PDB not in pdb["metadata"]["name"]
            and constants.MON_PDB not in pdb["metadata"]["name"]
        ):
            osd_pdb.append(pdb)
    blocking_pdb_exist = False
    if len(osd_pdb) == 1:
        logger.info(
            f"No blocking PDBs created, OSD PDB is {osd_pdb[0]['metadata']['name']}"
        )
    else:
        logger.info(
            f"Blocking PDBs are created: {[pdb['metadata']['name'] for pdb in osd_pdb]}"
        )
        # Iterate over the filtered OSD PDBs rather than re-indexing the full
        # item list, so mon/mds PDBs cannot shift the indices
        for pdb in osd_pdb:
            allowed_disruptions = pdb.get("status", {}).get("disruptionsAllowed")
            maximum_unavailable = pdb.get("spec", {}).get("maxUnavailable")
            if allowed_disruptions and maximum_unavailable != 1:
                logger.info(
                    f"Blocking PDB {pdb['metadata']['name']}: "
                    f"disruptionsAllowed={allowed_disruptions}, "
                    f"maxUnavailable={maximum_unavailable}"
                )
                blocking_pdb_exist = True
    return blocking_pdb_exist
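
A minimal sketch of how a test can poll this helper until the blocking PDBs clear, following the TimeoutSampler pattern used by the node-drain test in this commit (timeout and sleep values are illustrative):

from ocs_ci.ocs.cluster import validate_existence_of_blocking_pdb
from ocs_ci.utility.utils import TimeoutSampler

# Poll until no blocking PDBs remain (values are illustrative)
sampler = TimeoutSampler(
    timeout=100,
    sleep=10,
    func=validate_existence_of_blocking_pdb,
)
# wait_for_func_status returns True once the sampled function returns
# the expected result (False, i.e. no blocking PDBs) within the timeout
assert sampler.wait_for_func_status(result=False), "Blocking PDBs still exist"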


class CephClusterExternal(CephCluster):
"""
Handle all external ceph cluster related functionalities
1 change: 1 addition & 0 deletions ocs_ci/ocs/constants.py
@@ -268,6 +268,7 @@
CSI_CEPHFSPLUGIN_LABEL = "app=csi-cephfsplugin"
CSI_RBDPLUGIN_LABEL = "app=csi-rbdplugin"
OCS_OPERATOR_LABEL = "name=ocs-operator"
ROOK_CEPH_DRAIN_CANARY = "rook-ceph-drain-canary"
LOCAL_STORAGE_OPERATOR_LABEL = "name=local-storage-operator"
NOOBAA_APP_LABEL = "app=noobaa"
NOOBAA_CORE_POD_LABEL = "noobaa-core=noobaa"
5 changes: 4 additions & 1 deletion tests/ecosystem/upgrade/test_upgrade.py
@@ -2,7 +2,10 @@

import pytest

from ocs_ci.framework.testlib import ocs_upgrade, polarion_id
from ocs_ci.framework.testlib import (
    ocs_upgrade,
    polarion_id,
)
from ocs_ci.ocs.disruptive_operations import worker_node_shutdown, osd_node_reboot
from ocs_ci.ocs.ocs_upgrade import run_ocs_upgrade
from ocs_ci.utility.reporting import get_polarion_id
84 changes: 83 additions & 1 deletion tests/manage/z_cluster/nodes/test_nodes_maintenance.py
@@ -1,10 +1,12 @@
import logging
import pytest

import time

from subprocess import TimeoutExpired

from ocs_ci.ocs.exceptions import CephHealthException, ResourceWrongStatusException
from ocs_ci.utility.utils import ceph_health_check_base
from ocs_ci.utility.utils import ceph_health_check_base, TimeoutSampler

from ocs_ci.ocs import constants, machine, ocp, defaults
from ocs_ci.ocs.node import (
@@ -17,6 +19,7 @@
    get_node_objs,
    add_new_node_and_label_it,
)
from ocs_ci.ocs.cluster import validate_existence_of_blocking_pdb
from ocs_ci.framework.testlib import (
    tier1,
    tier2,
@@ -433,3 +436,82 @@ def test_simultaneous_drain_of_two_ocs_nodes(

        # Perform cluster and Ceph health checks
        self.sanity_helpers.health_check()

    def test_pdb_check_simultaneous_node_drains(
        self,
        pvc_factory,
        pod_factory,
        bucket_factory,
        rgw_bucket_factory,
        node_drain_teardown,
    ):
        """
        - Check for OSD PDBs before the drain
        - Maintenance (mark as unschedulable and drain) two worker nodes
          with a delay of 30 seconds between them
        - The drain completes on worker node A
        - The drain stays pending on worker node B due to blocking PDBs
        - Check the OSD PDBs
        - Mark node A as schedulable
        - Let the drain finish on node B
        - Mark node B as schedulable
        - Check cluster and Ceph health
        """

        # Validate OSD PDBs before the drain operation
        assert (
            not validate_existence_of_blocking_pdb()
        ), "Blocking PDBs exist, cannot perform drain"

        # Get two worker nodes to drain
        typed_nodes = get_nodes(num_of_nodes=2)
        assert len(typed_nodes) == 2, "Failed to find worker nodes for the test"
        node_A = typed_nodes[0].name
        node_B = typed_nodes[1].name

        # Drain node A and validate blocking PDBs
        drain_nodes([node_A])
        assert (
            validate_existence_of_blocking_pdb()
        ), "Blocking PDBs not created post drain"

        # Induce a delay between the two drains;
        # the node B drain is expected to stay pending due to blocking PDBs
        time.sleep(30)
        try:
            drain_nodes([node_B])
        except TimeoutExpired:
            # Mark node A schedulable again and let the drain finish on node B
            schedule_nodes([node_A])

        # Give the pending drain time to proceed
        time.sleep(40)

        # Validate OSD PDBs
        assert (
            validate_existence_of_blocking_pdb()
        ), "Blocking PDBs not created post second drain"

        # Mark node B schedulable again and recover the cluster
        schedule_nodes([node_B])

        sample = TimeoutSampler(
            timeout=100,
            sleep=10,
            func=validate_existence_of_blocking_pdb,
        )
        if not sample.wait_for_func_status(result=False):
            log.error("Blocking PDBs still exist")

        # Wait for storage pods
        pod.wait_for_storage_pods()

        # Perform cluster and Ceph health checks
        self.sanity_helpers.health_check(tries=50)

        # Check basic cluster functionality by creating resources
        # (pools, storage classes, PVCs, pods - both CephFS and RBD),
        # running IO and deleting the resources
        self.sanity_helpers.create_resources(
            pvc_factory, pod_factory, bucket_factory, rgw_bucket_factory
        )
        self.sanity_helpers.delete_resources()
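
For debugging, the PDB state this test asserts on can be dumped with the same ocp.OCP accessors the helper uses; a minimal sketch (the printed values depend on the live cluster):

from ocs_ci.ocs import constants, defaults, ocp

# Print the fields validate_existence_of_blocking_pdb() keys its
# decision on, for every PDB in the Rook namespace
pdb_obj = ocp.OCP(
    kind=constants.POD_DISRUPTION_BUDGET, namespace=defaults.ROOK_CLUSTER_NAMESPACE
)
for item in pdb_obj.get().get("items"):
    name = item["metadata"]["name"]
    allowed = item.get("status", {}).get("disruptionsAllowed")
    max_unavailable = item.get("spec", {}).get("maxUnavailable")
    print(f"{name}: disruptionsAllowed={allowed}, maxUnavailable={max_unavailable}")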
36 changes: 32 additions & 4 deletions tests/manage/z_cluster/upgrade/test_check_mon_pdb_post_upgrade.py
@@ -4,22 +4,31 @@
from semantic_version import Version

from ocs_ci.framework import config
from ocs_ci.framework.testlib import post_ocs_upgrade, ManageTest, skipif_external_mode
from ocs_ci.framework.testlib import (
    post_ocs_upgrade,
    ManageTest,
    skipif_external_mode,
    post_ocp_upgrade,
)
from ocs_ci.ocs.ocp import get_ocs_version
from ocs_ci.ocs.cluster import CephCluster
from ocs_ci.helpers.helpers import get_mon_pdb
from ocs_ci.ocs import constants
from ocs_ci.ocs.cluster import validate_existence_of_blocking_pdb
from ocs_ci.ocs.resources.pod import get_all_pods

log = logging.getLogger(__name__)


@post_ocs_upgrade
@skipif_external_mode
@pytest.mark.polarion_id("OCS-2449")
class TestToCheckMonPDBPostUpgrade(ManageTest):
    """
    Validate the mon PDB count post OCS upgrade
    """

    @post_ocs_upgrade
    @skipif_external_mode
    @pytest.mark.polarion_id("OCS-2449")
    def test_check_mon_pdb_post_upgrade(self):
        """
        Testcase to check disruptions_allowed and minimum
@@ -49,3 +58,22 @@ def test_check_mon_pdb_post_upgrade(self):
        assert (
            max_unavailable_mon == 1
        ), "Maximum unavailable mon count is not matching"

    @post_ocp_upgrade
    @post_ocs_upgrade
    @pytest.mark.polarion_id("OCS-2524")
    def test_check_osd_pdb_post_upgrade(self):
        """
        Test to verify OSD PDBs:
        1. After successful OCP and OCS upgrades, check that no blocking
           OSD PDBs are present
        2. Verify the rook-ceph-drain-canary pods are still running; they
           were observed to disappear after upgrading OCS to
           ocs-operator.v4.6.2-233.ci
        """
        assert (
            not validate_existence_of_blocking_pdb()
        ), "Blocking PDBs present in the cluster"

        # Check for rook-ceph-drain-canary pods
        assert get_all_pods(
            selector=constants.ROOK_CEPH_DRAIN_CANARY
        ), f"Canary pods not found on {get_ocs_version()}"
