Skip to content

Commit

Permalink
[kube-prometheus-stack] Disable specific alert in values.yaml (#1173)
Browse files Browse the repository at this point in the history
* [kube-prometheus-stack] allow disabling any specific alert in the PrometheusRules

Signed-off-by: Yves Mettier <ymettier@free.fr>

* sync rules

Signed-off-by: Yves Mettier <ymettier@free.fr>

* update helm version

Signed-off-by: Yves Mettier <ymettier@free.fr>
  • Loading branch information
ymettier committed Dec 30, 2021
1 parent f6bb3eb commit 51351ed
Show file tree
Hide file tree
Showing 22 changed files with 252 additions and 7 deletions.
2 changes: 1 addition & 1 deletion charts/kube-prometheus-stack/Chart.yaml
Expand Up @@ -17,7 +17,7 @@ name: kube-prometheus-stack
sources:
- https://github.com/prometheus-community/helm-charts
- https://github.com/prometheus-operator/kube-prometheus
version: 26.1.0
version: 26.2.0
appVersion: 0.53.1
kubeVersion: ">=1.16.0-0"
home: https://github.com/prometheus-operator/kube-prometheus
Expand Down
30 changes: 24 additions & 6 deletions charts/kube-prometheus-stack/hack/sync_prometheus_rules.py
Expand Up @@ -200,13 +200,13 @@ def yaml_str_repr(struct, indent=4):
return text


def add_rules_conditions(rules, indent=4):
"""Add if wrapper for rules, listed in alert_condition_map"""
def add_rules_conditions(rules, rules_map, indent=4):
"""Add if wrapper for rules, listed in rules_map"""
rule_condition = '{{- if %s }}\n'
for alert_name in alert_condition_map:
for alert_name in rules_map:
line_start = ' ' * indent + '- alert: '
if line_start + alert_name in rules:
rule_text = rule_condition % alert_condition_map[alert_name]
rule_text = rule_condition % rules_map[alert_name]
# add if condition
index = rules.index(line_start + alert_name)
rules = rules[:index] + rule_text + rules[index:]
Expand All @@ -217,7 +217,7 @@ def add_rules_conditions(rules, indent=4):
# we found the last alert in file if there are no alerts after it
next_index = len(rules)

# depending on the rule ordering in alert_condition_map it's possible that an if statement from another rule is present at the end of this block.
# depending on the rule ordering in rules_map it's possible that an if statement from another rule is present at the end of this block.
found_block_end = False
last_line_index = next_index
while not found_block_end:
Expand All @@ -234,6 +234,23 @@ def add_rules_conditions(rules, indent=4):
return rules


def add_rules_conditions_from_condition_map(rules, indent=4):
    """Wrap the alerts named in ``alert_condition_map`` in Helm ``if`` blocks.

    Thin convenience wrapper: it forwards ``rules`` and ``indent`` to
    ``add_rules_conditions`` together with the ``alert_condition_map``
    mapping (defined outside this function) and returns the result.
    """
    return add_rules_conditions(rules, alert_condition_map, indent)


def add_rules_per_rule_conditions(rules, group, indent=4):
    """Wrap each alert of ``group`` in a per-alert "disabled" Helm condition.

    For every entry of ``group['rules']`` that has an ``alert`` key, a Helm
    condition of the form
    ``not (.Values.defaultRules.disabled.<AlertName> | default false)`` is
    generated. The resulting ``{alert name: condition}`` mapping is passed
    to ``add_rules_conditions`` along with ``rules`` and ``indent``, and
    that call's result is returned.

    Entries without an ``alert`` key are ignored.
    """
    # The mapping is keyed by alert name, so several rules sharing one alert
    # name collapse into a single entry.
    # NOTE(review): verify that add_rules_conditions wraps every occurrence
    # of such a duplicated alert name, not only the first one.
    rules_condition_map = {
        rule['alert']: f"not (.Values.defaultRules.disabled.{rule['alert']} | default false)"
        for rule in group['rules']
        if 'alert' in rule
    }
    return add_rules_conditions(rules, rules_condition_map, indent)


def add_custom_labels(rules, indent=4):
"""Add if wrapper for additional rules labels"""
rule_condition = '{{- if .Values.defaultRules.additionalRuleLabels }}\n{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}\n{{- end }}'
Expand Down Expand Up @@ -270,7 +287,8 @@ def write_group_to_file(group, url, destination, min_kubernetes, max_kubernetes)
init_line += '\n' + replacement_map[line]['init']
# append per-alert rules
rules = add_custom_labels(rules)
rules = add_rules_conditions(rules)
rules = add_rules_conditions_from_condition_map(rules)
rules = add_rules_per_rule_conditions(rules, group)
# initialize header
lines = header % {
'name': group['name'],
Expand Down
Expand Up @@ -26,6 +26,7 @@ spec:
groups:
- name: alertmanager.rules
rules:
{{- if not (.Values.defaultRules.disabled.AlertmanagerFailedReload | default false) }}
- alert: AlertmanagerFailedReload
annotations:
description: Configuration has failed to load for {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod{{`}}`}}.
Expand All @@ -41,6 +42,8 @@ spec:
{{- if .Values.defaultRules.additionalRuleLabels }}
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
{{- end }}
{{- end }}
{{- if not (.Values.defaultRules.disabled.AlertmanagerMembersInconsistent | default false) }}
- alert: AlertmanagerMembersInconsistent
annotations:
description: Alertmanager {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod{{`}}`}} has only found {{`{{`}} $value {{`}}`}} members of the {{`{{`}}$labels.job{{`}}`}} cluster.
Expand All @@ -58,6 +61,8 @@ spec:
{{- if .Values.defaultRules.additionalRuleLabels }}
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
{{- end }}
{{- end }}
{{- if not (.Values.defaultRules.disabled.AlertmanagerFailedToSendAlerts | default false) }}
- alert: AlertmanagerFailedToSendAlerts
annotations:
description: Alertmanager {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod{{`}}`}} failed to send {{`{{`}} $value | humanizePercentage {{`}}`}} of notifications to {{`{{`}} $labels.integration {{`}}`}}.
Expand All @@ -76,6 +81,8 @@ spec:
{{- if .Values.defaultRules.additionalRuleLabels }}
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
{{- end }}
{{- end }}
{{- if not (.Values.defaultRules.disabled.AlertmanagerClusterFailedToSendAlerts | default false) }}
- alert: AlertmanagerClusterFailedToSendAlerts
annotations:
description: The minimum notification failure rate to {{`{{`}} $labels.integration {{`}}`}} sent from any instance in the {{`{{`}}$labels.job{{`}}`}} cluster is {{`{{`}} $value | humanizePercentage {{`}}`}}.
Expand All @@ -93,6 +100,7 @@ spec:
severity: critical
{{- if .Values.defaultRules.additionalRuleLabels }}
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
{{- end }}
{{- end }}
- alert: AlertmanagerClusterFailedToSendAlerts
annotations:
Expand All @@ -112,6 +120,7 @@ spec:
{{- if .Values.defaultRules.additionalRuleLabels }}
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
{{- end }}
{{- if not (.Values.defaultRules.disabled.AlertmanagerConfigInconsistent | default false) }}
- alert: AlertmanagerConfigInconsistent
annotations:
description: Alertmanager instances within the {{`{{`}}$labels.job{{`}}`}} cluster have different configurations.
Expand All @@ -128,6 +137,8 @@ spec:
{{- if .Values.defaultRules.additionalRuleLabels }}
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
{{- end }}
{{- end }}
{{- if not (.Values.defaultRules.disabled.AlertmanagerClusterDown | default false) }}
- alert: AlertmanagerClusterDown
annotations:
description: '{{`{{`}} $value | humanizePercentage {{`}}`}} of Alertmanager instances within the {{`{{`}}$labels.job{{`}}`}} cluster have been up for less than half of the last 5m.'
Expand All @@ -150,6 +161,8 @@ spec:
{{- if .Values.defaultRules.additionalRuleLabels }}
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
{{- end }}
{{- end }}
{{- if not (.Values.defaultRules.disabled.AlertmanagerClusterCrashlooping | default false) }}
- alert: AlertmanagerClusterCrashlooping
annotations:
description: '{{`{{`}} $value | humanizePercentage {{`}}`}} of Alertmanager instances within the {{`{{`}}$labels.job{{`}}`}} cluster have restarted at least 5 times in the last 10m.'
Expand All @@ -172,4 +185,5 @@ spec:
{{- if .Values.defaultRules.additionalRuleLabels }}
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
{{- end }}
{{- end }}
{{- end }}
Expand Up @@ -24,6 +24,7 @@ spec:
groups:
- name: config-reloaders
rules:
{{- if not (.Values.defaultRules.disabled.ConfigReloaderSidecarErrors | default false) }}
- alert: ConfigReloaderSidecarErrors
annotations:
description: 'Errors encountered while the {{`{{`}}$labels.pod{{`}}`}} config-reloader sidecar attempts to sync config in {{`{{`}}$labels.namespace{{`}}`}} namespace.
Expand All @@ -38,4 +39,5 @@ spec:
{{- if .Values.defaultRules.additionalRuleLabels }}
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
{{- end }}
{{- end }}
{{- end }}
Expand Up @@ -24,6 +24,7 @@ spec:
groups:
- name: etcd
rules:
{{- if not (.Values.defaultRules.disabled.etcdInsufficientMembers | default false) }}
- alert: etcdInsufficientMembers
annotations:
message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": insufficient members ({{`{{`}} $value {{`}}`}}).'
Expand All @@ -34,6 +35,8 @@ spec:
{{- if .Values.defaultRules.additionalRuleLabels }}
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
{{- end }}
{{- end }}
{{- if not (.Values.defaultRules.disabled.etcdNoLeader | default false) }}
- alert: etcdNoLeader
annotations:
message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": member {{`{{`}} $labels.instance {{`}}`}} has no leader.'
Expand All @@ -44,6 +47,8 @@ spec:
{{- if .Values.defaultRules.additionalRuleLabels }}
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
{{- end }}
{{- end }}
{{- if not (.Values.defaultRules.disabled.etcdHighNumberOfLeaderChanges | default false) }}
- alert: etcdHighNumberOfLeaderChanges
annotations:
message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": instance {{`{{`}} $labels.instance {{`}}`}} has seen {{`{{`}} $value {{`}}`}} leader changes within the last hour.'
Expand All @@ -54,6 +59,8 @@ spec:
{{- if .Values.defaultRules.additionalRuleLabels }}
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
{{- end }}
{{- end }}
{{- if not (.Values.defaultRules.disabled.etcdHighNumberOfFailedGRPCRequests | default false) }}
- alert: etcdHighNumberOfFailedGRPCRequests
annotations:
message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": {{`{{`}} $value {{`}}`}}% of requests for {{`{{`}} $labels.grpc_method {{`}}`}} failed on etcd instance {{`{{`}} $labels.instance {{`}}`}}.'
Expand All @@ -67,6 +74,7 @@ spec:
severity: warning
{{- if .Values.defaultRules.additionalRuleLabels }}
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
{{- end }}
{{- end }}
- alert: etcdHighNumberOfFailedGRPCRequests
annotations:
Expand All @@ -82,6 +90,7 @@ spec:
{{- if .Values.defaultRules.additionalRuleLabels }}
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
{{- end }}
{{- if not (.Values.defaultRules.disabled.etcdGRPCRequestsSlow | default false) }}
- alert: etcdGRPCRequestsSlow
annotations:
message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": gRPC requests to {{`{{`}} $labels.grpc_method {{`}}`}} are taking {{`{{`}} $value {{`}}`}}s on etcd instance {{`{{`}} $labels.instance {{`}}`}}.'
Expand All @@ -94,6 +103,8 @@ spec:
{{- if .Values.defaultRules.additionalRuleLabels }}
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
{{- end }}
{{- end }}
{{- if not (.Values.defaultRules.disabled.etcdMemberCommunicationSlow | default false) }}
- alert: etcdMemberCommunicationSlow
annotations:
message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": member communication with {{`{{`}} $labels.To {{`}}`}} is taking {{`{{`}} $value {{`}}`}}s on etcd instance {{`{{`}} $labels.instance {{`}}`}}.'
Expand All @@ -106,6 +117,8 @@ spec:
{{- if .Values.defaultRules.additionalRuleLabels }}
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
{{- end }}
{{- end }}
{{- if not (.Values.defaultRules.disabled.etcdHighNumberOfFailedProposals | default false) }}
- alert: etcdHighNumberOfFailedProposals
annotations:
message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": {{`{{`}} $value {{`}}`}} proposal failures within the last hour on etcd instance {{`{{`}} $labels.instance {{`}}`}}.'
Expand All @@ -116,6 +129,8 @@ spec:
{{- if .Values.defaultRules.additionalRuleLabels }}
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
{{- end }}
{{- end }}
{{- if not (.Values.defaultRules.disabled.etcdHighFsyncDurations | default false) }}
- alert: etcdHighFsyncDurations
annotations:
message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": 99th percentile fync durations are {{`{{`}} $value {{`}}`}}s on etcd instance {{`{{`}} $labels.instance {{`}}`}}.'
Expand All @@ -128,6 +143,8 @@ spec:
{{- if .Values.defaultRules.additionalRuleLabels }}
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
{{- end }}
{{- end }}
{{- if not (.Values.defaultRules.disabled.etcdHighCommitDurations | default false) }}
- alert: etcdHighCommitDurations
annotations:
message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": 99th percentile commit durations {{`{{`}} $value {{`}}`}}s on etcd instance {{`{{`}} $labels.instance {{`}}`}}.'
Expand All @@ -140,6 +157,8 @@ spec:
{{- if .Values.defaultRules.additionalRuleLabels }}
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
{{- end }}
{{- end }}
{{- if not (.Values.defaultRules.disabled.etcdHighNumberOfFailedHTTPRequests | default false) }}
- alert: etcdHighNumberOfFailedHTTPRequests
annotations:
message: '{{`{{`}} $value {{`}}`}}% of requests for {{`{{`}} $labels.method {{`}}`}} failed on etcd instance {{`{{`}} $labels.instance {{`}}`}}'
Expand All @@ -151,6 +170,7 @@ spec:
severity: warning
{{- if .Values.defaultRules.additionalRuleLabels }}
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
{{- end }}
{{- end }}
- alert: etcdHighNumberOfFailedHTTPRequests
annotations:
Expand All @@ -164,6 +184,7 @@ spec:
{{- if .Values.defaultRules.additionalRuleLabels }}
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
{{- end }}
{{- if not (.Values.defaultRules.disabled.etcdHTTPRequestsSlow | default false) }}
- alert: etcdHTTPRequestsSlow
annotations:
message: etcd instance {{`{{`}} $labels.instance {{`}}`}} HTTP requests to {{`{{`}} $labels.method {{`}}`}} are slow.
Expand All @@ -176,4 +197,5 @@ spec:
{{- if .Values.defaultRules.additionalRuleLabels }}
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
{{- end }}
{{- end }}
{{- end }}
Expand Up @@ -24,6 +24,7 @@ spec:
groups:
- name: general.rules
rules:
{{- if not (.Values.defaultRules.disabled.TargetDown | default false) }}
- alert: TargetDown
annotations:
description: '{{`{{`}} printf "%.4g" $value {{`}}`}}% of the {{`{{`}} $labels.job {{`}}`}}/{{`{{`}} $labels.service {{`}}`}} targets in {{`{{`}} $labels.namespace {{`}}`}} namespace are down.'
Expand All @@ -36,6 +37,8 @@ spec:
{{- if .Values.defaultRules.additionalRuleLabels }}
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
{{- end }}
{{- end }}
{{- if not (.Values.defaultRules.disabled.Watchdog | default false) }}
- alert: Watchdog
annotations:
description: 'This is an alert meant to ensure that the entire alerting pipeline is functional.
Expand All @@ -57,4 +60,5 @@ spec:
{{- if .Values.defaultRules.additionalRuleLabels }}
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
{{- end }}
{{- end }}
{{- end }}
Expand Up @@ -24,6 +24,7 @@ spec:
groups:
- name: kube-apiserver-slos
rules:
{{- if not (.Values.defaultRules.disabled.KubeAPIErrorBudgetBurn | default false) }}
- alert: KubeAPIErrorBudgetBurn
annotations:
description: The API server is burning too much error budget.
Expand All @@ -40,6 +41,7 @@ spec:
short: 5m
{{- if .Values.defaultRules.additionalRuleLabels }}
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
{{- end }}
{{- end }}
- alert: KubeAPIErrorBudgetBurn
annotations:
Expand Down
Expand Up @@ -24,6 +24,7 @@ spec:
groups:
- name: kube-state-metrics
rules:
{{- if not (.Values.defaultRules.disabled.KubeStateMetricsListErrors | default false) }}
- alert: KubeStateMetricsListErrors
annotations:
description: kube-state-metrics is experiencing errors at an elevated rate in list operations. This is likely causing it to not be able to expose metrics about Kubernetes objects correctly or at all.
Expand All @@ -40,6 +41,8 @@ spec:
{{- if .Values.defaultRules.additionalRuleLabels }}
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
{{- end }}
{{- end }}
{{- if not (.Values.defaultRules.disabled.KubeStateMetricsWatchErrors | default false) }}
- alert: KubeStateMetricsWatchErrors
annotations:
description: kube-state-metrics is experiencing errors at an elevated rate in watch operations. This is likely causing it to not be able to expose metrics about Kubernetes objects correctly or at all.
Expand All @@ -56,6 +59,8 @@ spec:
{{- if .Values.defaultRules.additionalRuleLabels }}
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
{{- end }}
{{- end }}
{{- if not (.Values.defaultRules.disabled.KubeStateMetricsShardingMismatch | default false) }}
- alert: KubeStateMetricsShardingMismatch
annotations:
description: kube-state-metrics pods are running with different --total-shards configuration, some Kubernetes objects may be exposed multiple times or not exposed at all.
Expand All @@ -68,6 +73,8 @@ spec:
{{- if .Values.defaultRules.additionalRuleLabels }}
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
{{- end }}
{{- end }}
{{- if not (.Values.defaultRules.disabled.KubeStateMetricsShardsMissing | default false) }}
- alert: KubeStateMetricsShardsMissing
annotations:
description: kube-state-metrics shards are missing, some Kubernetes objects are not being exposed.
Expand All @@ -84,4 +91,5 @@ spec:
{{- if .Values.defaultRules.additionalRuleLabels }}
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
{{- end }}
{{- end }}
{{- end }}

0 comments on commit 51351ed

Please sign in to comment.