Skip to content

Commit

Permalink
Merge remote-tracking branch 'origin/bugfix/ARTESCA-3543-missing-alerts' into w/123.0/bugfix/ARTESCA-3543-missing-alerts
Browse files Browse the repository at this point in the history
  • Loading branch information
TeddyAndrieux committed Feb 25, 2022
2 parents e265c8c + 9309687 commit 20877be
Show file tree
Hide file tree
Showing 8 changed files with 796 additions and 904 deletions.
5 changes: 5 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,11 @@
(PR[#3710](https://github.com/scality/metalk8s/pull/3710))

## Release 2.11.3 (in development)
### Enhancements

- Add some missing alerts in the alerts hierarchy
(PR[#3714](https://github.com/scality/metalk8s/pull/3714))

### Bug fixes

- Fix a bug during the upgrade that makes the workload plane Ingress controller
Expand Down
12 changes: 11 additions & 1 deletion tools/lib-alert-tree/lib_alert_tree/kubernetes.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ def pod_alerts(name, severity="warning", namespace="default"):
models.ExistingAlert(
alertname, severity=severity, namespace=namespace, pod=name
)
for alertname in ["KubePodNotReady"]
for alertname in ["KubePodNotReady", "KubePodCrashLooping"]
]


Expand Down Expand Up @@ -52,3 +52,13 @@ def statefulset_alerts(name, severity="warning", namespace="default"):
"KubeStatefulSetUpdateNotRolledOut",
]
]


def job_alerts(name, severity="warning", namespace="default"):
    """Return the existing alerts commonly watched for a Job.

    One ``models.ExistingAlert`` is built per alert name, each carrying the
    given ``severity`` and ``namespace``, with ``name`` passed as the ``job``
    keyword.
    """
    # NOTE(review): upstream Job alerts may carry the Job name in a
    # `job_name` label rather than `job` -- confirm against the rules in use.
    alertnames = ("KubeJobCompletion", "KubeJobFailed")
    alerts = []
    for alertname in alertnames:
        alerts.append(
            models.ExistingAlert(
                alertname, severity=severity, namespace=namespace, job=name
            )
        )
    return alerts
2 changes: 2 additions & 0 deletions tools/lib-alert-tree/metalk8s/nodes.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,14 @@
),
relationship=Relationship.ANY,
warning_children=[
Existing.warning("NodeFileDescriptorLimit"),
Existing.warning("NodeFilesystemAlmostOutOfSpace"),
Existing.warning("NodeFilesystemAlmostOutOfFiles"),
Existing.warning("NodeFilesystemFilesFillingUp"),
Existing.warning("NodeFilesystemSpaceFillingUp"),
],
critical_children=[
Existing.critical("NodeFileDescriptorLimit"),
Existing.critical("NodeFilesystemAlmostOutOfSpace"),
Existing.critical("NodeFilesystemAlmostOutOfFiles"),
Existing.critical("NodeFilesystemFilesFillingUp"),
Expand Down
1 change: 1 addition & 0 deletions tools/lib-alert-tree/metalk8s/platform/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
relationship=Relationship.ANY,
warning_children=[
Existing.warning("KubeAPIErrorBudgetBurn"),
Existing.warning("KubeAPITerminatedRequests"),
Existing.warning("etcdHighNumberOfFailedGRPCRequests"),
Existing.warning("etcdHTTPRequestsSlow"),
Existing.warning("etcdHighCommitDurations"),
Expand Down
9 changes: 9 additions & 0 deletions tools/lib-alert-tree/metalk8s/platform/observability.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
summary_name="The monitoring service",
relationship=Relationship.ANY,
warning_children=[
Existing.warning("PrometheusLabelLimitHit"),
Existing.warning("PrometheusTargetLimitHit"),
Existing.warning("PrometheusTSDBReloadsFailing"),
Existing.warning("PrometheusTSDBCompactionsFailing"),
Expand Down Expand Up @@ -47,11 +48,14 @@
),
],
critical_children=[
Existing.critical("KubeStateMetricsShardingMismatch"),
Existing.critical("KubeStateMetricsShardsMissing"),
Existing.critical("PrometheusRuleFailures"),
Existing.critical("PrometheusRemoteWriteBehind"),
Existing.critical("PrometheusRemoteStorageFailures"),
Existing.critical("PrometheusErrorSendingAlertsToAnyAlertmanager"),
Existing.critical("PrometheusBadConfig"),
Existing.critical("PrometheusTargetSyncFailure"),
],
duration="1m",
)
Expand All @@ -61,13 +65,18 @@
summary_name="The alerting service",
relationship=Relationship.ANY,
warning_children=[
Existing.warning("AlertmanagerClusterFailedToSendAlerts"),
Existing.warning("AlertmanagerFailedToSendAlerts"),
*statefulset_alerts(
"alertmanager-prometheus-operator-alertmanager",
severity="warning",
namespace="metalk8s-monitoring",
),
],
critical_children=[
Existing.critical("AlertmanagerClusterCrashlooping"),
Existing.critical("AlertmanagerClusterDown"),
Existing.critical("AlertmanagerClusterFailedToSendAlerts"),
Existing.critical("AlertmanagerConfigInconsistent"),
Existing.critical("AlertmanagerMembersInconsistent"),
Existing.critical("AlertmanagerFailedReload"),
Expand Down
21 changes: 21 additions & 0 deletions tools/lib-alert-tree/tests/test_kubernetes.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ def test_deployment_alerts():

def test_pod_alerts():
"""Check Pod alerts."""
# pylint: disable=line-too-long
test = D.warning(
"test",
children=kubernetes.pod_alerts("my-pod", namespace="my-ns"),
Expand All @@ -59,6 +60,7 @@ def test_pod_alerts():
== textwrap.dedent(
"""
test{severity='warning'}
├── KubePodCrashLooping{namespace=~'my-ns', pod=~'my-pod', severity='warning'}
└── KubePodNotReady{namespace=~'my-ns', pod=~'my-pod', severity='warning'}
"""
).lstrip()
Expand All @@ -84,3 +86,22 @@ def test_statefulset_alerts():
"""
).lstrip()
)


def test_job_alerts():
    """Verify the tree rendered from the common Job alerts."""
    job_children = kubernetes.job_alerts("my-job", namespace="my-ns")
    test = D.warning(
        "test",
        children=job_children,
        relationship=Relationship.ANY,
    )
    rendered = test.build_tree().show(stdout=False)

    # The leading newline of the literal is dropped by lstrip() after dedent.
    expected = textwrap.dedent(
        """
        test{severity='warning'}
        ├── KubeJobCompletion{job=~'my-job', namespace=~'my-ns', severity='warning'}
        └── KubeJobFailed{job=~'my-job', namespace=~'my-ns', severity='warning'}
        """
    ).lstrip()

    assert rendered == expected
16 changes: 8 additions & 8 deletions tools/rule_extractor/alerting_rules.json

Large diffs are not rendered by default.

1,634 changes: 739 additions & 895 deletions tools/rule_extractor/rules.json

Large diffs are not rendered by default.

0 comments on commit 20877be

Please sign in to comment.