Skip to content

Commit

Permalink
Merge pull request #113 from robusta-dev/node-health
Browse files Browse the repository at this point in the history
add node_status_enricher and node_health_watcher playbooks
  • Loading branch information
aantn committed Dec 5, 2021
2 parents d381d37 + a56138f commit 9c9d9f1
Show file tree
Hide file tree
Showing 3 changed files with 66 additions and 8 deletions.
16 changes: 11 additions & 5 deletions helm/robusta/values.yaml
Expand Up @@ -32,16 +32,22 @@ customPlaybooks: []

# builtin playbooks
builtinPlaybooks:
- triggers:
- on_prometheus_alert: {}
actions:
- name_silencer:
names: ["Watchdog", "KubeSchedulerDown", "KubeControllerManagerDown"]
# playbooks for non-prometheus based monitoring
- triggers:
- on_pod_update: {}
actions:
- restart_loop_reporter:
restart_reason: CrashLoopBackOff
- triggers:
- on_node_update: {}
actions:
- node_health_watcher: {}
# playbooks for prometheus enrichment and silencing
- triggers:
- on_prometheus_alert: {}
actions:
- name_silencer:
names: ["Watchdog", "KubeSchedulerDown", "KubeControllerManagerDown"]
- triggers:
- on_prometheus_alert:
alert_name: KubeNodeNotReady
Expand Down
4 changes: 2 additions & 2 deletions playbooks/deployment_enrichments.py
@@ -1,12 +1,12 @@
from robusta.api import *


# TODO: merge with node_status_enricher?
@action
def deployment_status_enricher(event: DeploymentEvent):
deployment = event.get_deployment()
if not deployment:
logging.error(
f"cannot run DeploymentStatusEnricher on event with no deployment: {event}"
f"cannot run deployment_status_enricher on event with no deployment: {event}"
)
return

Expand Down
54 changes: 53 additions & 1 deletion playbooks/node_enrichments.py
Expand Up @@ -38,7 +38,7 @@ def node_allocatable_resources_enricher(event: NodeEvent):
node = event.get_node()
if not node:
logging.error(
f"NodeAllocatableResourcesEnricher was called on event without node : {event}"
f"node_allocatable_resources_enricher was called on event without node : {event}"
)
return

Expand All @@ -56,3 +56,55 @@ def node_allocatable_resources_enricher(event: NodeEvent):
)
)
event.add_enrichment(block_list)


# TODO: can we make this a KubernetesAnyEvent and just check that the resource has .status.condition inside the code?
# TODO: merge with deployment_status_enricher?
@action
def node_status_enricher(event: NodeEvent):
    """
    Adds a table of the node's status conditions to the event.

    The enrichment consists of a markdown header followed by a two-column
    table (condition type, condition status) with one row per entry in
    ``node.status.conditions``.

    If the event carries no node, an error is logged and nothing is added.
    """
    # fetch the node once and reuse it, like node_allocatable_resources_enricher does
    node = event.get_node()
    if not node:
        logging.error(
            f"node_status_enricher was called on event without node : {event}"
        )
        return

    condition_rows = [
        [condition.type, condition.status] for condition in node.status.conditions
    ]
    event.add_enrichment(
        [
            MarkdownBlock("*Node status details:*"),
            TableBlock(condition_rows, headers=["Type", "Status"]),
        ]
    )


@action
def node_health_watcher(event: NodeChangeEvent):
    """
    Checks for unhealthy nodes and adds useful information when a node is unhealthy.

    Compares the "Ready" condition of the node before and after the change.
    A finding is created only when the node was ready before the change and is
    no longer ready after it; the finding is then enriched with the node's
    status conditions via node_status_enricher.

    If either version of the node does not have exactly one "Ready" condition,
    a warning is logged and the event is ignored.
    """
    new_ready_conditions = [
        c for c in event.obj.status.conditions if c.type == "Ready"
    ]
    old_ready_conditions = [
        c for c in event.old_obj.status.conditions if c.type == "Ready"
    ]

    if len(new_ready_conditions) != 1 or len(old_ready_conditions) != 1:
        logging.warning(
            f"expected exactly one Ready condition. "
            f"new={new_ready_conditions} old={old_ready_conditions}"
        )
        # without exactly one Ready condition on each side the comparison below
        # is meaningless (and indexing [0] would raise on an empty list)
        return

    new_condition = new_ready_conditions[0]
    old_condition = old_ready_conditions[0]

    currently_ready = "true" in new_condition.status.lower()
    previously_ready = "true" in old_condition.status.lower()

    # only report the transition from ready to not-ready; a node that is ready
    # now, or that was already not ready before, produces no finding
    if currently_ready or not previously_ready:
        return

    finding = Finding(
        title=f"Unhealthy node {event.obj.metadata.name}",
        source=FindingSource.KUBERNETES_API_SERVER,
        aggregation_key="node_not_ready",
    )
    event.add_finding(finding, "DEFAULT")
    node_status_enricher(event)

0 comments on commit 9c9d9f1

Please sign in to comment.