Automates BZ: OCS podDisruptionBudget prevents successful OCP upgrades
Signed-off-by: Shrivaibavi Raghaventhiran <sraghave@redhat.com>
Shrivaibavi committed Aug 10, 2021
1 parent c43584c commit 91db156
Showing 5 changed files with 161 additions and 6 deletions.
41 changes: 41 additions & 0 deletions ocs_ci/ocs/cluster.py
@@ -19,6 +19,7 @@
from ocs_ci.ocs.exceptions import UnexpectedBehaviour
from ocs_ci.ocs.resources import ocs, storage_cluster
import ocs_ci.ocs.constants as constant
from ocs_ci.ocs import defaults
from ocs_ci.ocs.resources.mcg import MCG
from ocs_ci.utility.retry import retry
from ocs_ci.utility.utils import (
@@ -1712,6 +1713,46 @@ def check_ceph_health_after_add_capacity(
), "Data re-balance failed to complete"


def validate_existence_of_blocking_pdb():
    """
    Validate creation of PDBs for OSDs.

    1. Versions earlier than ocs-operator.v4.6.2 have one PDB per OSD
    2. Versions greater than or equal to ocs-operator.v4.6.2-233.ci have
       a single PDB for all OSDs, named rook-ceph-osd

    Returns:
        bool: True if blocking PDBs are present, False otherwise

    """
    pdb_obj = ocp.OCP(
        kind=constants.POD_DISRUPTION_BUDGET, namespace=defaults.ROOK_CLUSTER_NAMESPACE
    )
    pdb_obj_get = pdb_obj.get()
    # Collect every PDB except the mds and mon ones
    osd_pdb = []
    for pdb in pdb_obj_get.get("items"):
        if (
            constants.MDS_PDB not in pdb["metadata"]["name"]
            and constants.MON_PDB not in pdb["metadata"]["name"]
        ):
            osd_pdb.append(pdb)
    blocking_pdb_exist = False
    if len(osd_pdb) == 1:
        logger.info(
            f"No blocking PDBs created, OSD PDB is {osd_pdb[0]['metadata']['name']}"
        )
    else:
        logger.info(
            f"Blocking PDBs are created: {[pdb['metadata']['name'] for pdb in osd_pdb]}"
        )
        # Iterate over the filtered OSD PDBs rather than re-indexing the full
        # item list, so mon/mds PDBs cannot shift the indices
        for pdb in osd_pdb:
            allowed_disruptions = pdb.get("status", {}).get("disruptionsAllowed")
            maximum_unavailable = pdb.get("spec", {}).get("maxUnavailable")
            if allowed_disruptions and maximum_unavailable != 1:
                logger.info(
                    f"Blocking PDB {pdb['metadata']['name']}: "
                    f"disruptionsAllowed={allowed_disruptions}, "
                    f"maxUnavailable={maximum_unavailable}"
                )
                blocking_pdb_exist = True
    return blocking_pdb_exist
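
A minimal sketch of how a test can poll this helper until the blocking PDBs clear, following the TimeoutSampler pattern used by the node-drain test in this commit (timeout and sleep values are illustrative):

from ocs_ci.ocs.cluster import validate_existence_of_blocking_pdb
from ocs_ci.utility.utils import TimeoutSampler

# Poll until no blocking PDBs remain (values are illustrative)
sampler = TimeoutSampler(
    timeout=100,
    sleep=10,
    func=validate_existence_of_blocking_pdb,
)
# wait_for_func_status returns True once the sampled function returns
# the expected result (False, i.e. no blocking PDBs) within the timeout
assert sampler.wait_for_func_status(result=False), "Blocking PDBs still exist"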


class CephClusterExternal(CephCluster):
"""
Handle all external ceph cluster related functionalities
1 change: 1 addition & 0 deletions ocs_ci/ocs/constants.py
@@ -268,6 +268,7 @@
CSI_CEPHFSPLUGIN_LABEL = "app=csi-cephfsplugin"
CSI_RBDPLUGIN_LABEL = "app=csi-rbdplugin"
OCS_OPERATOR_LABEL = "name=ocs-operator"
ROOK_CEPH_DRAIN_CANARY = "rook-ceph-drain-canary"
LOCAL_STORAGE_OPERATOR_LABEL = "name=local-storage-operator"
NOOBAA_APP_LABEL = "app=noobaa"
NOOBAA_CORE_POD_LABEL = "noobaa-core=noobaa"
5 changes: 4 additions & 1 deletion tests/ecosystem/upgrade/test_upgrade.py
@@ -2,7 +2,10 @@

import pytest

from ocs_ci.framework.testlib import ocs_upgrade, polarion_id
from ocs_ci.framework.testlib import (
    ocs_upgrade,
    polarion_id,
)
from ocs_ci.ocs.disruptive_operations import worker_node_shutdown, osd_node_reboot
from ocs_ci.ocs.ocs_upgrade import run_ocs_upgrade
from ocs_ci.utility.reporting import get_polarion_id
84 changes: 83 additions & 1 deletion tests/manage/z_cluster/nodes/test_nodes_maintenance.py
@@ -1,10 +1,12 @@
import logging
import pytest

import time

from subprocess import TimeoutExpired

from ocs_ci.ocs.exceptions import CephHealthException, ResourceWrongStatusException
from ocs_ci.utility.utils import ceph_health_check_base
from ocs_ci.utility.utils import ceph_health_check_base, TimeoutSampler

from ocs_ci.ocs import constants, machine, ocp, defaults
from ocs_ci.ocs.node import (
@@ -17,6 +19,7 @@
    get_node_objs,
    add_new_node_and_label_it,
)
from ocs_ci.ocs.cluster import validate_existence_of_blocking_pdb
from ocs_ci.framework.testlib import (
    tier1,
    tier2,
@@ -433,3 +436,82 @@ def test_simultaneous_drain_of_two_ocs_nodes(

        # Perform cluster and Ceph health checks
        self.sanity_helpers.health_check()

    def test_pdb_check_simultaneous_node_drains(
        self,
        pvc_factory,
        pod_factory,
        bucket_factory,
        rgw_bucket_factory,
        node_drain_teardown,
    ):
        """
        - Check for OSD PDBs before the drain
        - Maintenance (mark as unschedulable and drain) two worker nodes
          with a delay of 30 seconds between them
        - The drain completes on worker node A
        - The drain stays pending on worker node B due to blocking PDBs
        - Check the OSD PDBs
        - Mark node A as schedulable
        - Let the drain finish on node B
        - Mark node B as schedulable
        - Check cluster and Ceph health
        """

        # Validate OSD PDBs before the drain operation
        assert (
            not validate_existence_of_blocking_pdb()
        ), "Blocking PDBs exist, cannot perform drain"

        # Get two worker nodes to drain
        typed_nodes = get_nodes(num_of_nodes=2)
        assert len(typed_nodes) == 2, "Failed to find worker nodes for the test"
        node_A = typed_nodes[0].name
        node_B = typed_nodes[1].name

        # Drain node A and validate blocking PDBs
        drain_nodes([node_A])
        assert (
            validate_existence_of_blocking_pdb()
        ), "Blocking PDBs not created post drain"

        # Induce a delay between the two drains;
        # the node B drain is expected to stay pending due to blocking PDBs
        time.sleep(30)
        try:
            drain_nodes([node_B])
        except TimeoutExpired:
            # Mark node A schedulable again and let the drain finish on node B
            schedule_nodes([node_A])

        # Give the pending drain time to proceed
        time.sleep(40)

        # Validate OSD PDBs
        assert (
            validate_existence_of_blocking_pdb()
        ), "Blocking PDBs not created post second drain"

        # Mark node B schedulable again and recover the cluster
        schedule_nodes([node_B])

        sample = TimeoutSampler(
            timeout=100,
            sleep=10,
            func=validate_existence_of_blocking_pdb,
        )
        if not sample.wait_for_func_status(result=False):
            log.error("Blocking PDBs still exist")

        # Wait for storage pods
        pod.wait_for_storage_pods()

        # Perform cluster and Ceph health checks
        self.sanity_helpers.health_check(tries=50)

        # Check basic cluster functionality by creating resources
        # (pools, storage classes, PVCs, pods - both CephFS and RBD),
        # running IO and deleting the resources
        self.sanity_helpers.create_resources(
            pvc_factory, pod_factory, bucket_factory, rgw_bucket_factory
        )
        self.sanity_helpers.delete_resources()
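
For debugging, the PDB state this test asserts on can be dumped with the same ocp.OCP accessors the helper uses; a minimal sketch (the printed values depend on the live cluster):

from ocs_ci.ocs import constants, defaults, ocp

# Print the fields validate_existence_of_blocking_pdb() keys its
# decision on, for every PDB in the Rook namespace
pdb_obj = ocp.OCP(
    kind=constants.POD_DISRUPTION_BUDGET, namespace=defaults.ROOK_CLUSTER_NAMESPACE
)
for item in pdb_obj.get().get("items"):
    name = item["metadata"]["name"]
    allowed = item.get("status", {}).get("disruptionsAllowed")
    max_unavailable = item.get("spec", {}).get("maxUnavailable")
    print(f"{name}: disruptionsAllowed={allowed}, maxUnavailable={max_unavailable}")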
36 changes: 32 additions & 4 deletions tests/manage/z_cluster/upgrade/test_check_mon_pdb_post_upgrade.py
@@ -4,22 +4,31 @@
from semantic_version import Version

from ocs_ci.framework import config
from ocs_ci.framework.testlib import post_ocs_upgrade, ManageTest, skipif_external_mode
from ocs_ci.framework.testlib import (
    post_ocs_upgrade,
    ManageTest,
    skipif_external_mode,
    post_ocp_upgrade,
)
from ocs_ci.ocs.ocp import get_ocs_version
from ocs_ci.ocs.cluster import CephCluster
from ocs_ci.helpers.helpers import get_mon_pdb
from ocs_ci.ocs import constants
from ocs_ci.ocs.cluster import validate_existence_of_blocking_pdb
from ocs_ci.ocs.resources.pod import get_all_pods

log = logging.getLogger(__name__)


@post_ocs_upgrade
@skipif_external_mode
@pytest.mark.polarion_id("OCS-2449")
class TestToCheckMonPDBPostUpgrade(ManageTest):
    """
    Validate the mon PDB count post OCS upgrade
    """

    @post_ocs_upgrade
    @skipif_external_mode
    @pytest.mark.polarion_id("OCS-2449")
    def test_check_mon_pdb_post_upgrade(self):
        """
        Testcase to check disruptions_allowed and minimum
@@ -49,3 +58,22 @@ def test_check_mon_pdb_post_upgrade(self):
        assert (
            max_unavailable_mon == 1
        ), "Maximum unavailable mon count is not matching"

    @post_ocp_upgrade
    @post_ocs_upgrade
    @pytest.mark.polarion_id("OCS-2524")
    def test_check_osd_pdb_post_upgrade(self):
        """
        Test to verify OSD PDBs:
        1. After successful OCP and OCS upgrades, check that no blocking
           OSD PDBs are present
        2. Verify the rook-ceph-drain-canary pods are still running; they
           were observed to disappear after upgrading OCS to
           ocs-operator.v4.6.2-233.ci
        """
        assert (
            not validate_existence_of_blocking_pdb()
        ), "Blocking PDBs present in the cluster"

        # Check for rook-ceph-drain-canary pods
        assert get_all_pods(
            selector=constants.ROOK_CEPH_DRAIN_CANARY
        ), f"Canary pods not found on {get_ocs_version()}"
