Automates BZ: OCS podDisruptionBudget prevents successful OCP upgrades #3888

Merged
merged 4 commits into from Sep 2, 2021

Changes from 1 commit
37 changes: 37 additions & 0 deletions ocs_ci/ocs/cluster.py
@@ -19,6 +19,7 @@
from ocs_ci.ocs.exceptions import UnexpectedBehaviour
from ocs_ci.ocs.resources import ocs, storage_cluster
import ocs_ci.ocs.constants as constant
from ocs_ci.ocs import defaults
from ocs_ci.ocs.resources.mcg import MCG
from ocs_ci.utility.retry import retry
from ocs_ci.utility.utils import (
@@ -1712,6 +1713,42 @@ def check_ceph_health_after_add_capacity(
), "Data re-balance failed to complete"


def validate_existence_of_blocking_pdb():
    """
    Validate creation of PDBs for OSDs.
    1. Versions earlier than ocs-operator.v4.6.2 have one PDB per OSD
    2. Versions greater than or equal to ocs-operator.v4.6.2-233.ci have
       a single PDB covering all OSDs, such as rook-ceph-osd

    Returns:
        bool: True if blocking PDBs are present, False otherwise

    """
    pdb_obj = ocp.OCP(
        kind=constants.POD_DISRUPTION_BUDGET, namespace=defaults.ROOK_CLUSTER_NAMESPACE
    )
    pdb_obj_get = pdb_obj.get()
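    # Illustrative shape of pdb_obj_get (hypothetical values; field names
    # follow the Kubernetes PodDisruptionBudget API):
    #   {"items": [{"metadata": {"name": "rook-ceph-osd"},
    #               "spec": {"maxUnavailable": 1},
    #               "status": {"disruptionsAllowed": 1}}, ...]}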
    # Keep only the OSD PDBs: skip entries whose names match the mon or mds
    # PDB name patterns
    osd_pdb = []
    for pdb in pdb_obj_get.get("items"):
        if not any(
            osd in pdb["metadata"]["name"]
            for osd in [constants.MDS_PDB, constants.MON_PDB]
Review comment (Contributor): should we have osd pdb as well?

Reply (Contributor Author): We get only OSD PDBs from the list of PDBs, so we exclude the mon and mds PDBs.
        ):
            osd_pdb.append(pdb)

    blocking_pdb_exist = False
    for osd in range(len(osd_pdb)):
        allowed_disruptions = osd_pdb[osd].get("status").get("disruptionsAllowed")
        maximum_unavailable = osd_pdb[osd].get("spec").get("maxUnavailable")
Review comment (Contributor) on lines +1740 to +1741:

Not sure if it might happen, but if the status or spec key is not available, this will raise AttributeError: 'NoneType' object has no attribute 'get'. If that is a valid scenario (i.e. those keys might legitimately be absent) and we want some default value (None in this case), it is better to provide an empty dict ({}) as the default value for the get(...) method:

Suggested change:
    allowed_disruptions = osd_pdb[osd].get("status", {}).get("disruptionsAllowed")
    maximum_unavailable = osd_pdb[osd].get("spec", {}).get("maxUnavailable")

Or, if the status and spec keys should always be present, you might use direct indexing:

Suggested change:
    allowed_disruptions = osd_pdb[osd]["status"].get("disruptionsAllowed")
    maximum_unavailable = osd_pdb[osd]["spec"].get("maxUnavailable")

If for some reason the status or spec key is not present, this will raise KeyError: 'status', which might be slightly clearer than the AttributeError: 'NoneType' object has no attribute 'get' exception mentioned above.
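To make the failure mode concrete, here is a minimal standalone sketch of the two lookup styles (plain Python on a hypothetical PDB item, not the ocs-ci objects themselves):

    # Hypothetical PDB item that is missing its "status" key (illustrative only)
    pdb_item = {"metadata": {"name": "rook-ceph-osd"}, "spec": {"maxUnavailable": 1}}

    try:
        # .get("status") returns None, so the chained .get() raises
        pdb_item.get("status").get("disruptionsAllowed")
    except AttributeError as err:
        print(err)  # 'NoneType' object has no attribute 'get'

    # With an empty dict as the default, the lookup degrades to None instead
    print(pdb_item.get("status", {}).get("disruptionsAllowed"))  # None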

if allowed_disruptions & maximum_unavailable != 1:
logger.info("Blocking PDBs are created")
blocking_pdb_exist = True
else:
logger.info(
f"No blocking PDBs created, OSD PDB is {osd_pdb[osd].get('metadata').get('name')}"
)
return blocking_pdb_exist
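As a quick sanity check of that condition, the following sketch evaluates it for the value combinations expected around a drain (illustrative values, not read from a live cluster):

    for allowed, max_unavailable in [(1, 1), (0, 1), (0, 0)]:
        print(allowed, max_unavailable, allowed & max_unavailable != 1)
    # 1 1 False -> one disruption allowed, the PDB is not blocking
    # 0 1 True  -> no disruptions currently allowed, blocking
    # 0 0 True  -> blocking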


class CephClusterExternal(CephCluster):
"""
Handle all external ceph cluster related functionalities
1 change: 1 addition & 0 deletions ocs_ci/ocs/constants.py
@@ -270,6 +270,7 @@
CSI_CEPHFSPLUGIN_LABEL = "app=csi-cephfsplugin"
CSI_RBDPLUGIN_LABEL = "app=csi-rbdplugin"
OCS_OPERATOR_LABEL = "name=ocs-operator"
ROOK_CEPH_DRAIN_CANARY = "rook-ceph-drain-canary"
LOCAL_STORAGE_OPERATOR_LABEL = "name=local-storage-operator"
NOOBAA_APP_LABEL = "app=noobaa"
NOOBAA_CORE_POD_LABEL = "noobaa-core=noobaa"
5 changes: 4 additions & 1 deletion tests/ecosystem/upgrade/test_upgrade.py
@@ -2,7 +2,10 @@

import pytest

-from ocs_ci.framework.testlib import ocs_upgrade, polarion_id
+from ocs_ci.framework.testlib import (
+    ocs_upgrade,
+    polarion_id,
+)
from ocs_ci.ocs.disruptive_operations import worker_node_shutdown, osd_node_reboot
from ocs_ci.ocs.ocs_upgrade import run_ocs_upgrade
from ocs_ci.utility.reporting import get_polarion_id
88 changes: 87 additions & 1 deletion tests/manage/z_cluster/nodes/test_nodes_maintenance.py
@@ -1,10 +1,12 @@
import logging
import pytest

import time

from subprocess import TimeoutExpired

from ocs_ci.ocs.exceptions import CephHealthException, ResourceWrongStatusException
-from ocs_ci.utility.utils import ceph_health_check_base
+from ocs_ci.utility.utils import ceph_health_check_base, TimeoutSampler

from ocs_ci.ocs import constants, machine, ocp, defaults
from ocs_ci.ocs.node import (
@@ -17,6 +19,7 @@
get_node_objs,
add_new_node_and_label_it,
)
from ocs_ci.ocs.cluster import validate_existence_of_blocking_pdb
from ocs_ci.framework.testlib import (
tier1,
tier2,
@@ -28,6 +31,7 @@
ignore_leftovers,
ipi_deployment_required,
skipif_bm,
bugzilla,
)
from ocs_ci.helpers.sanity_helpers import Sanity, SanityExternalCluster
from ocs_ci.ocs.resources import pod
@@ -433,3 +437,85 @@ def test_simultaneous_drain_of_two_ocs_nodes(

        # Perform cluster and Ceph health checks
        self.sanity_helpers.health_check()

@bugzilla("1861104")
@pytest.mark.polarion_id("OCS-2524")
@tier4b
def test_pdb_check_simultaneous_node_drains(
self,
pvc_factory,
pod_factory,
bucket_factory,
rgw_bucket_factory,
node_drain_teardown,
):
"""
- Check for OSD PDBs before drain
- Maintenance (mark as unschedulable and drain) 2 worker node with delay of 30 secs
- Drain will be completed on worker node A
- Drain will be pending on worker node B due to blocking PDBs
- Check the OSD PDBs
- Mark the node A as schedulable
- Let drain finish on Node B
- Mark the node B as schedulable
- Check cluster and Ceph health

"""

        # Validate OSD PDBs before the drain operation
        assert (
            not validate_existence_of_blocking_pdb()
        ), "Blocking PDBs exist, can't perform drain"

        # Get 2 worker nodes to drain
        typed_nodes = get_nodes(num_of_nodes=2)
        assert len(typed_nodes) == 2, "Failed to find worker nodes for the test"
        node_A = typed_nodes[0].name
        node_B = typed_nodes[1].name

        # Drain node A and validate blocking PDBs
        drain_nodes([node_A])
        assert (
            validate_existence_of_blocking_pdb()
        ), "Blocking PDBs not created post drain"

        # Induce a delay between the two drains; the node B drain is expected
        # to stay pending due to the blocking PDBs
        time.sleep(30)
        try:
            drain_nodes([node_B])
        except TimeoutExpired:
            # Mark node A schedulable again and let the drain finish on node B
            schedule_nodes([node_A])

        time.sleep(40)

        # Validate OSD PDBs
        assert (
            validate_existence_of_blocking_pdb()
        ), "Blocking PDBs not created post second drain"

        # Mark node B schedulable again and recover the cluster
        schedule_nodes([node_B])

        sample = TimeoutSampler(
            timeout=100,
            sleep=10,
            func=validate_existence_of_blocking_pdb,
        )
        if not sample.wait_for_func_status(result=False):
            log.error("Blocking PDBs still exist")

        # Wait for the storage pods
        pod.wait_for_storage_pods()

        # Perform cluster and Ceph health checks
        self.sanity_helpers.health_check(tries=50)

        # Check basic cluster functionality by creating resources
        # (pools, storageclasses, PVCs, pods - both CephFS and RBD),
        # run IO and delete the resources
        self.sanity_helpers.create_resources(
            pvc_factory, pod_factory, bucket_factory, rgw_bucket_factory
        )
        self.sanity_helpers.delete_resources()
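The TimeoutSampler call above polls validate_existence_of_blocking_pdb every 10 seconds for up to 100 seconds, waiting for it to return False. As a rough mental model only (a simplified sketch, not the actual ocs_ci.utility.utils implementation), wait_for_func_status behaves roughly like:

    import time

    def wait_for_func_status(func, result, timeout=100, sleep=10):
        # Poll func every `sleep` seconds until it returns `result`,
        # or give up once `timeout` seconds have elapsed
        deadline = time.time() + timeout
        while time.time() < deadline:
            if func() == result:
                return True
            time.sleep(sleep)
        return False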
@@ -4,22 +4,32 @@
from semantic_version import Version

from ocs_ci.framework import config
-from ocs_ci.framework.testlib import post_ocs_upgrade, ManageTest, skipif_external_mode
+from ocs_ci.framework.testlib import (
+    post_ocs_upgrade,
+    ManageTest,
+    skipif_external_mode,
+    post_ocp_upgrade,
+    bugzilla,
+)
from ocs_ci.ocs.ocp import get_ocs_version
from ocs_ci.ocs.cluster import CephCluster
from ocs_ci.helpers.helpers import get_mon_pdb
from ocs_ci.ocs import constants
from ocs_ci.ocs.cluster import validate_existence_of_blocking_pdb
from ocs_ci.ocs.resources.pod import get_all_pods

log = logging.getLogger(__name__)


-@post_ocs_upgrade
-@skipif_external_mode
-@pytest.mark.polarion_id("OCS-2449")
-class TestToCheckMonPDBPostUpgrade(ManageTest):
+class TestToCheckPDBPostUpgrade(ManageTest):
    """
    Validate post ocs upgrade mon pdb count

    """

+    @post_ocs_upgrade
+    @skipif_external_mode
+    @pytest.mark.polarion_id("OCS-2449")
    def test_check_mon_pdb_post_upgrade(self):
        """
        Testcase to check disruptions_allowed and minimum
@@ -49,3 +59,23 @@ def test_check_mon_pdb_post_upgrade(self):
        assert (
            max_unavailable_mon == 1
        ), "Maximum unavailable mon count is not matching"

    @post_ocp_upgrade
    @post_ocs_upgrade
    @bugzilla("1861104")
    @pytest.mark.polarion_id("OCS-2626")
    def test_check_osd_pdb_post_upgrade(self):
        """
        Test to verify OSD PDBs:
        1. After successful OCP and OCS upgrades, check for blocking OSD PDBs
        2. Verify that the rook-ceph-drain-canary pods have disappeared after
           the OCS upgrade to ocs-operator.v4.6.2-233.ci

        """
        assert (
            not validate_existence_of_blocking_pdb()
        ), "Blocking PDBs present in the cluster"

        # Check that no rook-ceph-drain-canary pods remain (they are removed
        # as of ocs-operator.v4.6.2-233.ci, per the docstring above)
        assert not get_all_pods(
            selector=constants.ROOK_CEPH_DRAIN_CANARY
        ), f"Canary pods found on {get_ocs_version()}"