From 91db1566100b85b623545587bdb74b28bb1f2a6c Mon Sep 17 00:00:00 2001
From: Shrivaibavi Raghaventhiran
Date: Mon, 22 Feb 2021 23:25:40 +0530
Subject: [PATCH] Automate BZ: OCS podDisruptionBudget prevents successful OCP upgrades

Signed-off-by: Shrivaibavi Raghaventhiran
---
 ocs_ci/ocs/cluster.py                          | 41 +++++++++
 ocs_ci/ocs/constants.py                        |  1 +
 tests/ecosystem/upgrade/test_upgrade.py        |  5 +-
 .../z_cluster/nodes/test_nodes_maintenance.py  | 84 ++++++++++++++++++-
 .../test_check_mon_pdb_post_upgrade.py         | 36 +++++++-
 5 files changed, 161 insertions(+), 6 deletions(-)

diff --git a/ocs_ci/ocs/cluster.py b/ocs_ci/ocs/cluster.py
index e3f93829bc7b..9c3d103a4ac0 100644
--- a/ocs_ci/ocs/cluster.py
+++ b/ocs_ci/ocs/cluster.py
@@ -19,6 +19,7 @@
 from ocs_ci.ocs.exceptions import UnexpectedBehaviour
 from ocs_ci.ocs.resources import ocs, storage_cluster
 import ocs_ci.ocs.constants as constant
+from ocs_ci.ocs import defaults
 from ocs_ci.ocs.resources.mcg import MCG
 from ocs_ci.utility.retry import retry
 from ocs_ci.utility.utils import (
@@ -1712,6 +1713,46 @@ def check_ceph_health_after_add_capacity(
     ), "Data re-balance failed to complete"
 
 
+def validate_existence_of_blocking_pdb():
+    """
+    Validate creation of PDBs for OSDs.
+    1. Versions older than ocs-operator.v4.6.2 have one PDB per OSD
+    2. Versions ocs-operator.v4.6.2-233.ci and above have a single PDB
+    covering all OSDs, such as rook-ceph-osd
+
+    Returns:
+        bool: True if blocking PDBs are present, False otherwise
+
+    """
+    pdb_obj = ocp.OCP(
+        kind=constants.POD_DISRUPTION_BUDGET, namespace=defaults.ROOK_CLUSTER_NAMESPACE
+    )
+    pdb_obj_get = pdb_obj.get()
+    # Keep the full PDB items so the spec/status lookups below refer to
+    # the filtered OSD PDBs rather than the unfiltered item list
+    osd_pdb = []
+    # Ignore mon and mds PDBs; everything else relates to OSDs
+    for pdb in pdb_obj_get.get("items"):
+        if (
+            constants.MDS_PDB not in pdb["metadata"]["name"]
+            and constants.MON_PDB not in pdb["metadata"]["name"]
+        ):
+            osd_pdb.append(pdb)
+    osd_pdb_names = [pdb["metadata"]["name"] for pdb in osd_pdb]
+    blocking_pdb_exist = False
+    if len(osd_pdb) == 1:
+        logger.info(f"No blocking PDBs created, OSD PDB is {osd_pdb_names}")
+    else:
+        logger.info(f"Blocking PDBs are created: {osd_pdb_names}")
+        for pdb in osd_pdb:
+            allowed_disruptions = pdb.get("status").get("disruptionsAllowed")
+            maximum_unavailable = pdb.get("spec").get("maxUnavailable")
+            if allowed_disruptions and maximum_unavailable != 1:
+                logger.info(f"Blocking PDB found: {pdb['metadata']['name']}")
+                blocking_pdb_exist = True
+    return blocking_pdb_exist
+
+
 class CephClusterExternal(CephCluster):
     """
     Handle all external ceph cluster related functionalities
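For reviewers who want to reproduce the blocking-PDB check outside of ocs-ci, here is a minimal standalone sketch against a live cluster using the kubernetes Python client. It is an illustration only, not part of the patch: the openshift-storage namespace, the literal mon/mds name filters, and PolicyV1beta1Api (current for clients of this era; newer releases expose PolicyV1Api) are assumptions.

# Standalone sketch of the blocking-PDB check, for manual verification only.
# Assumes a reachable cluster via kubeconfig and the default OCS namespace.
from kubernetes import client, config


def blocking_osd_pdbs(namespace="openshift-storage"):
    config.load_kube_config()  # use load_incluster_config() inside a pod
    policy = client.PolicyV1beta1Api()  # PolicyV1Api on newer clients
    pdbs = policy.list_namespaced_pod_disruption_budget(namespace).items
    # Ignore mon and mds PDBs, mirroring validate_existence_of_blocking_pdb
    osd_pdbs = [
        p
        for p in pdbs
        if "rook-ceph-mon" not in p.metadata.name
        and "rook-ceph-mds" not in p.metadata.name
    ]
    # A single rook-ceph-osd PDB is the healthy state; multiple OSD PDBs
    # appear while a drain is in progress and block further disruptions
    if len(osd_pdbs) == 1:
        return []
    return [p.metadata.name for p in osd_pdbs]


if __name__ == "__main__":
    blocked = blocking_osd_pdbs()
    print(f"Blocking OSD PDBs: {blocked or 'none'}")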
diff --git a/ocs_ci/ocs/constants.py b/ocs_ci/ocs/constants.py
index a732a8e53e7b..97f92b6f836d 100644
--- a/ocs_ci/ocs/constants.py
+++ b/ocs_ci/ocs/constants.py
@@ -268,6 +268,7 @@
 CSI_CEPHFSPLUGIN_LABEL = "app=csi-cephfsplugin"
 CSI_RBDPLUGIN_LABEL = "app=csi-rbdplugin"
 OCS_OPERATOR_LABEL = "name=ocs-operator"
+ROOK_CEPH_DRAIN_CANARY = "rook-ceph-drain-canary"
 LOCAL_STORAGE_OPERATOR_LABEL = "name=local-storage-operator"
 NOOBAA_APP_LABEL = "app=noobaa"
 NOOBAA_CORE_POD_LABEL = "noobaa-core=noobaa"
diff --git a/tests/ecosystem/upgrade/test_upgrade.py b/tests/ecosystem/upgrade/test_upgrade.py
index 6f48639f17b5..81b584e48501 100644
--- a/tests/ecosystem/upgrade/test_upgrade.py
+++ b/tests/ecosystem/upgrade/test_upgrade.py
@@ -2,7 +2,10 @@
 
 import pytest
 
-from ocs_ci.framework.testlib import ocs_upgrade, polarion_id
+from ocs_ci.framework.testlib import (
+    ocs_upgrade,
+    polarion_id,
+)
 from ocs_ci.ocs.disruptive_operations import worker_node_shutdown, osd_node_reboot
 from ocs_ci.ocs.ocs_upgrade import run_ocs_upgrade
 from ocs_ci.utility.reporting import get_polarion_id
diff --git a/tests/manage/z_cluster/nodes/test_nodes_maintenance.py b/tests/manage/z_cluster/nodes/test_nodes_maintenance.py
index 06d0dc6010c1..92ad14456a2f 100644
--- a/tests/manage/z_cluster/nodes/test_nodes_maintenance.py
+++ b/tests/manage/z_cluster/nodes/test_nodes_maintenance.py
@@ -1,10 +1,12 @@
 import logging
 import pytest
+import time
+
 from subprocess import TimeoutExpired
 
 from ocs_ci.ocs.exceptions import CephHealthException, ResourceWrongStatusException
-from ocs_ci.utility.utils import ceph_health_check_base
+from ocs_ci.utility.utils import ceph_health_check_base, TimeoutSampler
 
 from ocs_ci.ocs import constants, machine, ocp, defaults
 from ocs_ci.ocs.node import (
@@ -17,6 +19,7 @@
     get_node_objs,
     add_new_node_and_label_it,
 )
+from ocs_ci.ocs.cluster import validate_existence_of_blocking_pdb
 from ocs_ci.framework.testlib import (
     tier1,
     tier2,
@@ -433,3 +436,82 @@ def test_simultaneous_drain_of_two_ocs_nodes(
 
         # Perform cluster and Ceph health checks
         self.sanity_helpers.health_check()
+
+    def test_pdb_check_simultaneous_node_drains(
+        self,
+        pvc_factory,
+        pod_factory,
+        bucket_factory,
+        rgw_bucket_factory,
+        node_drain_teardown,
+    ):
+        """
+        - Check for OSD PDBs before the drain
+        - Drain two worker nodes (mark unschedulable and drain) 30 seconds apart
+        - The drain completes on worker node A
+        - The drain stays pending on worker node B due to blocking PDBs
+        - Check the OSD PDBs
+        - Mark node A as schedulable
+        - Let the drain finish on node B
+        - Mark node B as schedulable
+        - Check cluster and Ceph health
+
+        """
+
+        # Validate OSD PDBs before the drain operation
+        assert (
+            not validate_existence_of_blocking_pdb()
+        ), "Blocking PDBs exist, can't perform drain"
+
+        # Get 2 worker nodes to drain
+        typed_nodes = get_nodes(num_of_nodes=2)
+        assert len(typed_nodes) == 2, "Failed to find worker nodes for the test"
+        node_A = typed_nodes[0].name
+        node_B = typed_nodes[1].name
+
+        # Drain node A and validate blocking PDBs
+        drain_nodes([node_A])
+        assert (
+            validate_existence_of_blocking_pdb()
+        ), "Blocking PDBs not created post drain"
+
+        # Induce a delay between the two drains; the drain of node B is
+        # expected to stay pending due to the blocking PDBs
+        time.sleep(30)
+        try:
+            drain_nodes([node_B])
+        except TimeoutExpired:
+            # Mark node A schedulable again and let the drain finish on node B
+            schedule_nodes([node_A])
+
+        time.sleep(40)
+
+        # Validate OSD PDBs
+        assert (
+            validate_existence_of_blocking_pdb()
+        ), "Blocking PDBs not created post second drain"
+
+        # Mark node B schedulable again and recover the cluster
+        schedule_nodes([node_B])
+
+        sample = TimeoutSampler(
+            timeout=100,
+            sleep=10,
+            func=validate_existence_of_blocking_pdb,
+        )
+        if not sample.wait_for_func_status(result=False):
+            log.error("Blocking PDBs still exist after the nodes were recovered")
+
+        # Wait for the storage pods to get back to a running state
+        pod.wait_for_storage_pods()
+
+        # Perform cluster and Ceph health checks
+        self.sanity_helpers.health_check(tries=50)
+
+        # Check basic cluster functionality by creating resources
+        # (pools, storageclasses, PVCs, pods - both CephFS and RBD),
+        # run IO and delete the resources
+        self.sanity_helpers.create_resources(
+            pvc_factory, pod_factory, bucket_factory, rgw_bucket_factory
+        )
+        self.sanity_helpers.delete_resources()
diff --git a/tests/manage/z_cluster/upgrade/test_check_mon_pdb_post_upgrade.py b/tests/manage/z_cluster/upgrade/test_check_mon_pdb_post_upgrade.py
index c6f4847fe647..57b6cba2504e 100644
---
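The recovery step at the end of the new test polls validate_existence_of_blocking_pdb through TimeoutSampler until the blocking PDBs clear. For readers unfamiliar with that helper, below is a rough standalone sketch of the same poll-until-status pattern; the function name and signature here are illustrative, not the ocs-ci API.

# Generic poll-until-status helper, illustrating what the test expects from
# TimeoutSampler.wait_for_func_status(result=False). Names are hypothetical.
import time


def wait_for_func_status(func, expected, timeout=100, sleep=10):
    """Call func() every `sleep` seconds until it returns `expected`.

    Returns True on success, False once `timeout` seconds have elapsed.
    """
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        if func() == expected:
            return True
        time.sleep(sleep)
    return False


# Usage mirroring the test: wait until no blocking PDBs remain
# if not wait_for_func_status(validate_existence_of_blocking_pdb, False):
#     log.error("Blocking PDBs still exist")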
a/tests/manage/z_cluster/upgrade/test_check_mon_pdb_post_upgrade.py
+++ b/tests/manage/z_cluster/upgrade/test_check_mon_pdb_post_upgrade.py
@@ -4,22 +4,31 @@
 from semantic_version import Version
 
 from ocs_ci.framework import config
-from ocs_ci.framework.testlib import post_ocs_upgrade, ManageTest, skipif_external_mode
+from ocs_ci.framework.testlib import (
+    post_ocs_upgrade,
+    ManageTest,
+    skipif_external_mode,
+    post_ocp_upgrade,
+)
+from ocs_ci.ocs.ocp import get_ocs_version
 from ocs_ci.ocs.cluster import CephCluster
 from ocs_ci.helpers.helpers import get_mon_pdb
+from ocs_ci.ocs import constants
+from ocs_ci.ocs.cluster import validate_existence_of_blocking_pdb
+from ocs_ci.ocs.resources.pod import get_all_pods
 
 log = logging.getLogger(__name__)
 
 
-@post_ocs_upgrade
-@skipif_external_mode
-@pytest.mark.polarion_id("OCS-2449")
 class TestToCheckMonPDBPostUpgrade(ManageTest):
     """
     Validate post ocs upgrade mon pdb count
 
     """
 
+    @post_ocs_upgrade
+    @skipif_external_mode
+    @pytest.mark.polarion_id("OCS-2449")
     def test_check_mon_pdb_post_upgrade(self):
         """
         Testcase to check disruptions_allowed and minimum
@@ -49,3 +58,22 @@ def test_check_mon_pdb_post_upgrade(self):
         assert (
             max_unavailable_mon == 1
         ), "Maximum unavailable mon count is not matching"
+
+    @post_ocp_upgrade
+    @post_ocs_upgrade
+    @pytest.mark.polarion_id("OCS-2524")
+    def test_check_osd_pdb_post_upgrade(self):
+        """
+        Test to verify OSD PDBs
+        1. Check for OSD PDBs after successful OCP and OCS upgrades
+        2. Verify the rook-ceph-drain-canary pods are gone once OCS is upgraded to ocs-operator.v4.6.2-233.ci
+
+        """
+        assert (
+            not validate_existence_of_blocking_pdb()
+        ), "Blocking PDBs present in the cluster"
+
+        # Canary pods were removed in ocs-operator.v4.6.2-233.ci; none should remain
+        assert not get_all_pods(
+            selector=constants.ROOK_CEPH_DRAIN_CANARY
+        ), f"Canary pods still present on {get_ocs_version()}"
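To spot-check the canary assertion by hand, the sketch below lists pods carrying the rook-ceph-drain-canary app label with the kubernetes Python client. The app= label selector and the openshift-storage namespace are assumptions based on the constant added above, not something this patch defines.

# Manual check for leftover drain-canary pods after the upgrade.
# Assumes kubeconfig access and the default "openshift-storage" namespace.
from kubernetes import client, config


def drain_canary_pods(namespace="openshift-storage"):
    config.load_kube_config()
    core = client.CoreV1Api()
    pods = core.list_namespaced_pod(
        namespace, label_selector="app=rook-ceph-drain-canary"
    ).items
    return [p.metadata.name for p in pods]


if __name__ == "__main__":
    leftovers = drain_canary_pods()
    assert not leftovers, f"Drain canary pods still present: {leftovers}"
    print("No drain-canary pods found")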