Skip to content

Commit

Permalink
[kube-prometheus-stack] Allow to configure keep_firing_for for all Pr…
Browse files Browse the repository at this point in the history
…ometheus rules (#3890)

* [kube-prometheus-stack] Allow to configure keep_firing_for for all Prometheus rules

Signed-off-by: Jan-Otto Kröpke <joe@cloudeteer.de>

* s/keep_firing_fore/keep_firing_for

Signed-off-by: Jan-Otto Kröpke <joe@cloudeteer.de>

---------

Signed-off-by: Jan-Otto Kröpke <joe@cloudeteer.de>
  • Loading branch information
jkroepke committed Oct 13, 2023
1 parent 9d3a0d6 commit dbeb5cd
Show file tree
Hide file tree
Showing 22 changed files with 420 additions and 1 deletion.
2 changes: 1 addition & 1 deletion charts/kube-prometheus-stack/Chart.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ name: kube-prometheus-stack
sources:
- https://github.com/prometheus-community/helm-charts
- https://github.com/prometheus-operator/kube-prometheus
version: 51.6.1
version: 51.7.0
appVersion: v0.68.0
kubeVersion: ">=1.19.0-0"
home: https://github.com/prometheus-operator/kube-prometheus
Expand Down
23 changes: 23 additions & 0 deletions charts/kube-prometheus-stack/hack/sync_prometheus_rules.py
Original file line number Diff line number Diff line change
Expand Up @@ -358,6 +358,28 @@ def add_custom_annotations(rules, group, indent=4):

return rules


def add_custom_keep_firing_for(rules, indent=4):
"""Add if wrapper for additional rules annotations"""
indent_spaces = " " * indent + " "
keep_firing_for = (indent_spaces + '{{- with .Values.defaultRules.keepFiringFor }}\n' +
indent_spaces + 'keep_firing_for: "{{ . }}"\n' +
indent_spaces + '{{- end }}')
keep_firing_for_len = len(keep_firing_for) + 1

separator = " " * indent + " for:.*"
alerts_positions = re.finditer(separator, rules)
alert = 0

for alert_position in alerts_positions:
# Add rule_condition after 'annotations:' statement
index = alert_position.end() + keep_firing_for_len * alert
rules = rules[:index] + "\n" + keep_firing_for + rules[index:]
alert += 1

return rules


def write_group_to_file(group, url, destination, min_kubernetes, max_kubernetes):
fix_expr(group['rules'])
group_name = group['name']
Expand All @@ -374,6 +396,7 @@ def write_group_to_file(group, url, destination, min_kubernetes, max_kubernetes)
# append per-alert rules
rules = add_custom_labels(rules, group)
rules = add_custom_annotations(rules, group)
rules = add_custom_keep_firing_for(rules)
rules = add_rules_conditions_from_condition_map(rules)
rules = add_rules_per_rule_conditions(rules, group)
# initialize header
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,9 @@ spec:
# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
max_over_time(alertmanager_config_last_reload_successful{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"}[5m]) == 0
for: 10m
{{- with .Values.defaultRules.keepFiringFor }}
keep_firing_for: "{{ . }}"
{{- end }}
labels:
severity: critical
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.alertmanager }}
Expand Down Expand Up @@ -73,6 +76,9 @@ spec:
< on (namespace,service,cluster) group_left
count by (namespace,service,cluster) (max_over_time(alertmanager_cluster_members{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"}[5m]))
for: 15m
{{- with .Values.defaultRules.keepFiringFor }}
keep_firing_for: "{{ . }}"
{{- end }}
labels:
severity: critical
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.alertmanager }}
Expand Down Expand Up @@ -104,6 +110,9 @@ spec:
)
> 0.01
for: 5m
{{- with .Values.defaultRules.keepFiringFor }}
keep_firing_for: "{{ . }}"
{{- end }}
labels:
severity: warning
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.alertmanager }}
Expand Down Expand Up @@ -135,6 +144,9 @@ spec:
)
> 0.01
for: 5m
{{- with .Values.defaultRules.keepFiringFor }}
keep_firing_for: "{{ . }}"
{{- end }}
labels:
severity: critical
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.alertmanager }}
Expand Down Expand Up @@ -166,6 +178,9 @@ spec:
)
> 0.01
for: 5m
{{- with .Values.defaultRules.keepFiringFor }}
keep_firing_for: "{{ . }}"
{{- end }}
labels:
severity: warning
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.alertmanager }}
Expand Down Expand Up @@ -195,6 +210,9 @@ spec:
)
!= 1
for: 20m
{{- with .Values.defaultRules.keepFiringFor }}
keep_firing_for: "{{ . }}"
{{- end }}
labels:
severity: critical
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.alertmanager }}
Expand Down Expand Up @@ -230,6 +248,9 @@ spec:
)
>= 0.5
for: 5m
{{- with .Values.defaultRules.keepFiringFor }}
keep_firing_for: "{{ . }}"
{{- end }}
labels:
severity: critical
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.alertmanager }}
Expand Down Expand Up @@ -265,6 +286,9 @@ spec:
)
>= 0.5
for: 5m
{{- with .Values.defaultRules.keepFiringFor }}
keep_firing_for: "{{ . }}"
{{- end }}
labels:
severity: critical
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.alertmanager }}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,9 @@ spec:
summary: config-reloader sidecar has not had a successful reload for 10m
expr: max_over_time(reloader_last_reload_successful{namespace=~".+"}[5m]) == 0
for: 10m
{{- with .Values.defaultRules.keepFiringFor }}
keep_firing_for: "{{ . }}"
{{- end }}
labels:
severity: warning
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.configReloaders }}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,9 @@ spec:
)
> 0
for: 10m
{{- with .Values.defaultRules.keepFiringFor }}
keep_firing_for: "{{ . }}"
{{- end }}
labels:
severity: critical
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.etcd }}
Expand All @@ -69,6 +72,9 @@ spec:
summary: etcd cluster has insufficient number of members.
expr: sum(up{job=~".*etcd.*"} == bool 1) without (instance) < ((count(up{job=~".*etcd.*"}) without (instance) + 1) / 2)
for: 3m
{{- with .Values.defaultRules.keepFiringFor }}
keep_firing_for: "{{ . }}"
{{- end }}
labels:
severity: critical
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.etcd }}
Expand All @@ -93,6 +99,9 @@ spec:
summary: etcd cluster has no leader.
expr: etcd_server_has_leader{job=~".*etcd.*"} == 0
for: 1m
{{- with .Values.defaultRules.keepFiringFor }}
keep_firing_for: "{{ . }}"
{{- end }}
labels:
severity: critical
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.etcd }}
Expand All @@ -117,6 +126,9 @@ spec:
summary: etcd cluster has high number of leader changes.
expr: increase((max without (instance) (etcd_server_leader_changes_seen_total{job=~".*etcd.*"}) or 0*absent(etcd_server_leader_changes_seen_total{job=~".*etcd.*"}))[15m:1m]) >= 4
for: 5m
{{- with .Values.defaultRules.keepFiringFor }}
keep_firing_for: "{{ . }}"
{{- end }}
labels:
severity: warning
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.etcd }}
Expand Down Expand Up @@ -145,6 +157,9 @@ spec:
sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) without (grpc_type, grpc_code)
> 1
for: 10m
{{- with .Values.defaultRules.keepFiringFor }}
keep_firing_for: "{{ . }}"
{{- end }}
labels:
severity: warning
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.etcd }}
Expand Down Expand Up @@ -173,6 +188,9 @@ spec:
sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) without (grpc_type, grpc_code)
> 5
for: 5m
{{- with .Values.defaultRules.keepFiringFor }}
keep_firing_for: "{{ . }}"
{{- end }}
labels:
severity: critical
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.etcd }}
Expand All @@ -199,6 +217,9 @@ spec:
histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job=~".*etcd.*", grpc_method!="Defragment", grpc_type="unary"}[5m])) without(grpc_type))
> 0.15
for: 10m
{{- with .Values.defaultRules.keepFiringFor }}
keep_firing_for: "{{ . }}"
{{- end }}
labels:
severity: critical
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.etcd }}
Expand All @@ -225,6 +246,9 @@ spec:
histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket{job=~".*etcd.*"}[5m]))
> 0.15
for: 10m
{{- with .Values.defaultRules.keepFiringFor }}
keep_firing_for: "{{ . }}"
{{- end }}
labels:
severity: warning
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.etcd }}
Expand All @@ -249,6 +273,9 @@ spec:
summary: etcd cluster has high number of proposal failures.
expr: rate(etcd_server_proposals_failed_total{job=~".*etcd.*"}[15m]) > 5
for: 15m
{{- with .Values.defaultRules.keepFiringFor }}
keep_firing_for: "{{ . }}"
{{- end }}
labels:
severity: warning
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.etcd }}
Expand All @@ -275,6 +302,9 @@ spec:
histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
> 0.5
for: 10m
{{- with .Values.defaultRules.keepFiringFor }}
keep_firing_for: "{{ . }}"
{{- end }}
labels:
severity: warning
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.etcd }}
Expand All @@ -301,6 +331,9 @@ spec:
histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
> 1
for: 10m
{{- with .Values.defaultRules.keepFiringFor }}
keep_firing_for: "{{ . }}"
{{- end }}
labels:
severity: critical
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.etcd }}
Expand All @@ -327,6 +360,9 @@ spec:
histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
> 0.25
for: 10m
{{- with .Values.defaultRules.keepFiringFor }}
keep_firing_for: "{{ . }}"
{{- end }}
labels:
severity: warning
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.etcd }}
Expand All @@ -351,6 +387,9 @@ spec:
summary: etcd cluster database is running full.
expr: (last_over_time(etcd_mvcc_db_total_size_in_bytes{job=~".*etcd.*"}[5m]) / last_over_time(etcd_server_quota_backend_bytes{job=~".*etcd.*"}[5m]))*100 > 95
for: 10m
{{- with .Values.defaultRules.keepFiringFor }}
keep_firing_for: "{{ . }}"
{{- end }}
labels:
severity: critical
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.etcd }}
Expand All @@ -375,6 +414,9 @@ spec:
summary: etcd cluster database growing very fast.
expr: predict_linear(etcd_mvcc_db_total_size_in_bytes{job=~".*etcd.*"}[4h], 4*60*60) > etcd_server_quota_backend_bytes{job=~".*etcd.*"}
for: 10m
{{- with .Values.defaultRules.keepFiringFor }}
keep_firing_for: "{{ . }}"
{{- end }}
labels:
severity: warning
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.etcd }}
Expand All @@ -400,6 +442,9 @@ spec:
summary: etcd database size in use is less than 50% of the actual allocated storage.
expr: (last_over_time(etcd_mvcc_db_total_size_in_use_in_bytes{job=~".*etcd.*"}[5m]) / last_over_time(etcd_mvcc_db_total_size_in_bytes{job=~".*etcd.*"}[5m])) < 0.5 and etcd_mvcc_db_total_size_in_use_in_bytes{job=~".*etcd.*"} > 104857600
for: 10m
{{- with .Values.defaultRules.keepFiringFor }}
keep_firing_for: "{{ . }}"
{{- end }}
labels:
severity: warning
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.etcd }}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,9 @@ spec:
summary: One or more targets are unreachable.
expr: 100 * (count(up == 0) BY (cluster, job, namespace, service) / count(up) BY (cluster, job, namespace, service)) > 10
for: 10m
{{- with .Values.defaultRules.keepFiringFor }}
keep_firing_for: "{{ . }}"
{{- end }}
labels:
severity: warning
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.general }}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,9 @@ spec:
and
sum(apiserver_request:burnrate5m) > (14.40 * 0.01000)
for: 2m
{{- with .Values.defaultRules.keepFiringFor }}
keep_firing_for: "{{ . }}"
{{- end }}
labels:
long: 1h
severity: critical
Expand Down Expand Up @@ -71,6 +74,9 @@ spec:
and
sum(apiserver_request:burnrate30m) > (6.00 * 0.01000)
for: 15m
{{- with .Values.defaultRules.keepFiringFor }}
keep_firing_for: "{{ . }}"
{{- end }}
labels:
long: 6h
severity: critical
Expand Down Expand Up @@ -101,6 +107,9 @@ spec:
and
sum(apiserver_request:burnrate2h) > (3.00 * 0.01000)
for: 1h
{{- with .Values.defaultRules.keepFiringFor }}
keep_firing_for: "{{ . }}"
{{- end }}
labels:
long: 1d
severity: warning
Expand Down Expand Up @@ -131,6 +140,9 @@ spec:
and
sum(apiserver_request:burnrate6h) > (1.00 * 0.01000)
for: 3h
{{- with .Values.defaultRules.keepFiringFor }}
keep_firing_for: "{{ . }}"
{{- end }}
labels:
long: 3d
severity: warning
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,9 @@ spec:
sum(rate(kube_state_metrics_list_total{job="kube-state-metrics"}[5m])) by (cluster))
> 0.01
for: 15m
{{- with .Values.defaultRules.keepFiringFor }}
keep_firing_for: "{{ . }}"
{{- end }}
labels:
severity: critical
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubeStateMetrics }}
Expand Down Expand Up @@ -71,6 +74,9 @@ spec:
sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics"}[5m])) by (cluster))
> 0.01
for: 15m
{{- with .Values.defaultRules.keepFiringFor }}
keep_firing_for: "{{ . }}"
{{- end }}
labels:
severity: critical
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubeStateMetrics }}
Expand All @@ -96,6 +102,9 @@ spec:
summary: kube-state-metrics sharding is misconfigured.
expr: stdvar (kube_state_metrics_total_shards{job="kube-state-metrics"}) by (cluster) != 0
for: 15m
{{- with .Values.defaultRules.keepFiringFor }}
keep_firing_for: "{{ . }}"
{{- end }}
labels:
severity: critical
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubeStateMetrics }}
Expand Down Expand Up @@ -125,6 +134,9 @@ spec:
sum( 2 ^ max by (cluster, shard_ordinal) (kube_state_metrics_shard_ordinal{job="kube-state-metrics"}) ) by (cluster)
!= 0
for: 15m
{{- with .Values.defaultRules.keepFiringFor }}
keep_firing_for: "{{ . }}"
{{- end }}
labels:
severity: critical
{{- if or .Values.defaultRules.additionalRuleLabels .Values.defaultRules.additionalRuleGroupLabels.kubeStateMetrics }}
Expand Down

0 comments on commit dbeb5cd

Please sign in to comment.