diff --git a/ocs_ci/ocs/cluster.py b/ocs_ci/ocs/cluster.py
index e3f93829bc7..6fd01255290 100644
--- a/ocs_ci/ocs/cluster.py
+++ b/ocs_ci/ocs/cluster.py
@@ -19,6 +19,7 @@
 from ocs_ci.ocs.exceptions import UnexpectedBehaviour
 from ocs_ci.ocs.resources import ocs, storage_cluster
 import ocs_ci.ocs.constants as constant
+from ocs_ci.ocs import defaults
 from ocs_ci.ocs.resources.mcg import MCG
 from ocs_ci.utility.retry import retry
 from ocs_ci.utility.utils import (
@@ -1712,6 +1713,42 @@ def check_ceph_health_after_add_capacity(
     ), "Data re-balance failed to complete"
 
 
+def validate_existence_of_blocking_pdb():
+    """
+    Validate creation of PDBs for OSDs.
+    1. Versions earlier than ocs-operator.v4.6.2 have one PDB per OSD
+    2. Versions greater than or equal to ocs-operator.v4.6.2-233.ci have
+       a single PDB covering all OSDs (rook-ceph-osd)
+
+    Returns:
+        bool: True if blocking PDBs are present, False otherwise
+
+    """
+    pdb_obj = ocp.OCP(
+        kind=constants.POD_DISRUPTION_BUDGET, namespace=defaults.ROOK_CLUSTER_NAMESPACE
+    )
+    pdb_obj_get = pdb_obj.get()
+    osd_pdb = []
+    for pdb in pdb_obj_get.get("items"):
+        if not any(
+            osd in pdb["metadata"]["name"]
+            for osd in [constants.MDS_PDB, constants.MON_PDB]
+        ):
+            osd_pdb.append(pdb)
+    blocking_pdb_exist = False
+    for pdb in osd_pdb:
+        allowed_disruptions = pdb.get("status").get("disruptionsAllowed")
+        maximum_unavailable = pdb.get("spec").get("maxUnavailable")
+        if not (allowed_disruptions == 1 and maximum_unavailable == 1):
+            logger.info("Blocking PDBs are created")
+            blocking_pdb_exist = True
+        else:
+            logger.info(
+                f"No blocking PDBs created, OSD PDB is {pdb.get('metadata').get('name')}"
+            )
+    return blocking_pdb_exist
+
+
 class CephClusterExternal(CephCluster):
     """
     Handle all external ceph cluster related functionalities
diff --git a/ocs_ci/ocs/constants.py b/ocs_ci/ocs/constants.py
index 62177f51957..2e5ddc4e623 100644
--- a/ocs_ci/ocs/constants.py
+++ b/ocs_ci/ocs/constants.py
@@ -270,6 +270,7 @@
 CSI_CEPHFSPLUGIN_LABEL = "app=csi-cephfsplugin"
 CSI_RBDPLUGIN_LABEL = "app=csi-rbdplugin"
 OCS_OPERATOR_LABEL = "name=ocs-operator"
+ROOK_CEPH_DRAIN_CANARY = "rook-ceph-drain-canary"
 LOCAL_STORAGE_OPERATOR_LABEL = "name=local-storage-operator"
 NOOBAA_APP_LABEL = "app=noobaa"
 NOOBAA_CORE_POD_LABEL = "noobaa-core=noobaa"
diff --git a/tests/ecosystem/upgrade/test_upgrade.py b/tests/ecosystem/upgrade/test_upgrade.py
index 6f48639f17b..81b584e4850 100644
--- a/tests/ecosystem/upgrade/test_upgrade.py
+++ b/tests/ecosystem/upgrade/test_upgrade.py
@@ -2,7 +2,10 @@
 
 import pytest
 
-from ocs_ci.framework.testlib import ocs_upgrade, polarion_id
+from ocs_ci.framework.testlib import (
+    ocs_upgrade,
+    polarion_id,
+)
 from ocs_ci.ocs.disruptive_operations import worker_node_shutdown, osd_node_reboot
 from ocs_ci.ocs.ocs_upgrade import run_ocs_upgrade
 from ocs_ci.utility.reporting import get_polarion_id
diff --git a/tests/manage/z_cluster/nodes/test_nodes_maintenance.py b/tests/manage/z_cluster/nodes/test_nodes_maintenance.py
index 06d0dc6010c..dcc6eacb064 100644
--- a/tests/manage/z_cluster/nodes/test_nodes_maintenance.py
+++ b/tests/manage/z_cluster/nodes/test_nodes_maintenance.py
@@ -1,10 +1,12 @@
 import logging
 
 import pytest
+import time
+
 from subprocess import TimeoutExpired
 
 from ocs_ci.ocs.exceptions import CephHealthException, ResourceWrongStatusException
-from ocs_ci.utility.utils import ceph_health_check_base
+from ocs_ci.utility.utils import ceph_health_check_base, TimeoutSampler
 from ocs_ci.ocs import constants, machine, ocp, defaults
 
 from ocs_ci.ocs.node import (
@@ -17,6 +19,7 @@
     get_node_objs,
     add_new_node_and_label_it,
 )
+from ocs_ci.ocs.cluster import validate_existence_of_blocking_pdb
 from ocs_ci.framework.testlib import (
     tier1,
     tier2,
@@ -28,6 +31,7 @@
     ignore_leftovers,
     ipi_deployment_required,
     skipif_bm,
+    bugzilla,
 )
 from ocs_ci.helpers.sanity_helpers import Sanity, SanityExternalCluster
 from ocs_ci.ocs.resources import pod
@@ -433,3 +437,85 @@ def test_simultaneous_drain_of_two_ocs_nodes(
 
         # Perform cluster and Ceph health checks
         self.sanity_helpers.health_check()
+
+    @bugzilla("1861104")
+    @pytest.mark.polarion_id("OCS-2524")
+    @tier4b
+    def test_pdb_check_simultaneous_node_drains(
+        self,
+        pvc_factory,
+        pod_factory,
+        bucket_factory,
+        rgw_bucket_factory,
+        node_drain_teardown,
+    ):
+        """
+        - Check for OSD PDBs before the drain
+        - Maintenance (mark as unschedulable and drain) 2 worker nodes with a delay of 30 seconds
+        - Drain will be completed on worker node A
+        - Drain will be pending on worker node B due to blocking PDBs
+        - Check the OSD PDBs
+        - Mark node A as schedulable
+        - Let the drain finish on node B
+        - Mark node B as schedulable
+        - Check cluster and Ceph health
+
+        """
+
+        # Validate OSD PDBs before the drain operation
+        assert (
+            not validate_existence_of_blocking_pdb()
+        ), "Blocking PDBs exist, can't perform drain"
+
+        # Get 2 worker nodes to drain
+        typed_nodes = get_nodes(num_of_nodes=2)
+        assert len(typed_nodes) == 2, "Failed to find worker nodes for the test"
+        node_A = typed_nodes[0].name
+        node_B = typed_nodes[1].name
+
+        # Drain node A and validate blocking PDBs
+        drain_nodes([node_A])
+        assert (
+            validate_existence_of_blocking_pdb()
+        ), "Blocking PDBs not created post drain"
+
+        # Induce a delay between the 2 drains
+        # Node B drain is expected to stay pending due to blocking PDBs
+        time.sleep(30)
+        try:
+            drain_nodes([node_B])
+        except TimeoutExpired:
+            # Mark node A schedulable again and let the drain finish on node B
+            schedule_nodes([node_A])
+
+        time.sleep(40)
+
+        # Validate OSD PDBs
+        assert (
+            validate_existence_of_blocking_pdb()
+        ), "Blocking PDBs not created post second drain"
+
+        # Mark node B schedulable again and recover the cluster
+        schedule_nodes([node_B])
+
+        sample = TimeoutSampler(
+            timeout=100,
+            sleep=10,
+            func=validate_existence_of_blocking_pdb,
+        )
+        if not sample.wait_for_func_status(result=False):
+            log.error("Blocking PDBs still exist")
+
+        # Wait for the storage pods
+        pod.wait_for_storage_pods()
+
+        # Perform cluster and Ceph health checks
+        self.sanity_helpers.health_check(tries=50)
+
+        # Check basic cluster functionality by creating resources
+        # (pools, storageclasses, PVCs, pods - both CephFS and RBD),
+        # run IO and delete the resources
+        self.sanity_helpers.create_resources(
+            pvc_factory, pod_factory, bucket_factory, rgw_bucket_factory
+        )
+        self.sanity_helpers.delete_resources()
diff --git a/tests/manage/z_cluster/upgrade/test_check_mon_pdb_post_upgrade.py b/tests/manage/z_cluster/upgrade/test_check_pdb_post_upgrade.py
similarity index 59%
rename from tests/manage/z_cluster/upgrade/test_check_mon_pdb_post_upgrade.py
rename to tests/manage/z_cluster/upgrade/test_check_pdb_post_upgrade.py
index c6f4847fe64..50748d81c28 100644
--- a/tests/manage/z_cluster/upgrade/test_check_mon_pdb_post_upgrade.py
+++ b/tests/manage/z_cluster/upgrade/test_check_pdb_post_upgrade.py
@@ -4,22 +4,32 @@
 from semantic_version import Version
 
 from ocs_ci.framework import config
-from ocs_ci.framework.testlib import post_ocs_upgrade, ManageTest, skipif_external_mode
+from ocs_ci.framework.testlib import (
+    post_ocs_upgrade,
+    ManageTest,
+    skipif_external_mode,
+    post_ocp_upgrade,
+    bugzilla,
+)
+from ocs_ci.ocs.ocp import get_ocs_version
 from ocs_ci.ocs.cluster import CephCluster
 from ocs_ci.helpers.helpers import get_mon_pdb
+from ocs_ci.ocs import constants
+from ocs_ci.ocs.cluster import validate_existence_of_blocking_pdb
+from ocs_ci.ocs.resources.pod import get_all_pods
 
 log = logging.getLogger(__name__)
 
 
-@post_ocs_upgrade
-@skipif_external_mode
-@pytest.mark.polarion_id("OCS-2449")
-class TestToCheckMonPDBPostUpgrade(ManageTest):
+class TestToCheckPDBPostUpgrade(ManageTest):
     """
     Validate post ocs upgrade mon pdb count
 
     """
 
+    @post_ocs_upgrade
+    @skipif_external_mode
+    @pytest.mark.polarion_id("OCS-2449")
     def test_check_mon_pdb_post_upgrade(self):
         """
         Testcase to check disruptions_allowed and minimum
@@ -49,3 +59,23 @@ def test_check_mon_pdb_post_upgrade(self):
         assert (
             max_unavailable_mon == 1
         ), "Maximum unavailable mon count is not matching"
+
+    @post_ocp_upgrade
+    @post_ocs_upgrade
+    @bugzilla("1861104")
+    @pytest.mark.polarion_id("OCS-2626")
+    def test_check_osd_pdb_post_upgrade(self):
+        """
+        Test to verify OSD PDBs
+        1. After successful OCP and OCS upgrades, check the OSD PDBs
+        2. Rook-ceph-drain-canary pods are removed after upgrading OCS to ocs-operator.v4.6.2-233.ci
+
+        """
+        assert (
+            not validate_existence_of_blocking_pdb()
+        ), "Blocking PDBs present in the cluster"
+
+        # Verify that the rook-ceph-drain-canary pods are gone
+        assert not get_all_pods(
+            selector=[constants.ROOK_CEPH_DRAIN_CANARY]
+        ), f"Rook-ceph-drain-canary pods still present on {get_ocs_version()}"
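
Reviewer note (not part of the patch): below is a minimal standalone sketch of the check that the new validate_existence_of_blocking_pdb() helper performs, handy for eyeballing the PDB state on a live cluster without the ocs-ci framework. It assumes a logged-in `oc` CLI, that the storage namespace is "openshift-storage" (the usual value of defaults.ROOK_CLUSTER_NAMESPACE), and that constants.MDS_PDB / constants.MON_PDB resolve to the "rook-ceph-mds" / "rook-ceph-mon" name prefixes.

import json
import subprocess

NAMESPACE = "openshift-storage"  # assumed storage namespace


def blocking_osd_pdbs_exist(namespace=NAMESPACE):
    """Return True if any OSD PDB currently blocks node drains."""
    out = subprocess.run(
        ["oc", "get", "poddisruptionbudget", "-n", namespace, "-o", "json"],
        check=True,
        capture_output=True,
        text=True,
    ).stdout
    items = json.loads(out)["items"]
    # Ignore the mon and mds PDBs; everything else is treated as an OSD PDB,
    # mirroring the filtering done in validate_existence_of_blocking_pdb().
    osd_pdbs = [
        pdb
        for pdb in items
        if not any(
            name in pdb["metadata"]["name"]
            for name in ("rook-ceph-mds", "rook-ceph-mon")
        )
    ]
    for pdb in osd_pdbs:
        allowed = pdb.get("status", {}).get("disruptionsAllowed", 0)
        max_unavailable = pdb.get("spec", {}).get("maxUnavailable")
        # A non-blocking OSD PDB allows exactly one disruption (maxUnavailable: 1);
        # anything else (e.g. maxUnavailable: 0, or no disruptions allowed) blocks drains.
        if not (allowed == 1 and max_unavailable == 1):
            return True
    return False


if __name__ == "__main__":
    print("Blocking OSD PDBs present:", blocking_osd_pdbs_exist())

Treating any OSD PDB that does not allow exactly one disruption as blocking matches the scenario the tests rely on: during a drain, rook replaces the single rook-ceph-osd PDB with per-failure-domain PDBs that allow no disruptions, and those are exactly the "blocking" PDBs the helper is meant to detect.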