Merge branch 'improvement/ARTESCA-4300_OOMKill_alert' into q/124.1
bert-e committed May 22, 2023
2 parents f1e2939 + 0bba23d commit 0c71b05
Showing 7 changed files with 2,111 additions and 1,925 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
@@ -3,6 +3,10 @@

## Release 124.1.6

### Enhancements

- Add Out Of Memory alerts for containers
  (PR[#4042](https://github.com/scality/metalk8s/pull/4042))

### Bug fixes

- Ensure metalk8s-sosreport package gets upgraded on each patch version.
9 changes: 8 additions & 1 deletion buildchain/buildchain/salt_tree.py
@@ -383,7 +383,14 @@ def task(self) -> types.TaskDict:
Path("salt/metalk8s/addons/prometheus-operator/deployed/grafana-ini-configmap.sls"),
Path("salt/metalk8s/addons/prometheus-operator/deployed/init.sls"),
Path("salt/metalk8s/addons/prometheus-operator/deployed/namespace.sls"),
Path("salt/metalk8s/addons/prometheus-operator/deployed/prometheus-rules.sls"),
Path(
    "salt/metalk8s/addons/prometheus-operator/deployed/",
    "node-alerts-rules.sls",
),
Path(
    "salt/metalk8s/addons/prometheus-operator/deployed/",
    "container-alerts-rules.sls",
),
Path("salt/metalk8s/addons/prometheus-operator/deployed/service-configuration.sls"),
Path("salt/metalk8s/addons/prometheus-operator/deployed/thanos-chart.sls"),
Path("salt/metalk8s/addons/ui/deployed/dependencies.sls"),
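For readers unfamiliar with `pathlib`: the two-argument `Path(...)` form added above joins its positional segments, so it is equivalent to the earlier single-string form and only exists to keep the long paths within line-length limits. A minimal standalone sketch (not part of the commit):

```python
from pathlib import Path

# Illustration only (not part of this commit): pathlib joins positional
# segments, so the wrapped two-argument form equals one long string.
base = "salt/metalk8s/addons/prometheus-operator/deployed/"
wrapped = Path(base, "container-alerts-rules.sls")
single = Path(base + "container-alerts-rules.sls")
assert wrapped == single  # both yield .../deployed/container-alerts-rules.sls
```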
62 changes: 62 additions & 0 deletions salt/metalk8s/addons/prometheus-operator/deployed/container-alerts-rules.sls (new file)
@@ -0,0 +1,62 @@
#!jinja | metalk8s_kubernetes

{%- from "metalk8s/repo/macro.sls" import build_image_name with context %}
{%- set prometheus_defaults = salt.slsutil.renderer(
        'salt://metalk8s/addons/prometheus-operator/config/prometheus.yaml',
        saltenv=saltenv
    )
%}
{%- set prometheus = salt.metalk8s_service_configuration.get_service_conf(
        'metalk8s-monitoring', 'metalk8s-prometheus-config', prometheus_defaults
    )
%}
{%- set rules = prometheus.get('spec', {}).get('rules', {}) %}
{%- raw %}
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  labels:
    app.kubernetes.io/part-of: metalk8s
    metalk8s.scality.com/monitor: ''
  name: metalk8s-containers.rules
  namespace: metalk8s-monitoring
spec:
  groups:
  - name: container-exporter
    rules:
    - alert: KubeContainerOOMKilled
      annotations:
        description: 'Container {{ $labels.namespace }}/{{ $labels.pod }}/{{ $labels.container }}
          on Node {{ $labels.node }} exceeds its memory allocation limit (OOMKilled). Pod accumulates
          a total of {{ printf "%.0f" $value }} restart(s).'
        summary: 'Container killed by exceeding memory allocation limits (OOMKilled).'
      expr: |-
        max_over_time(
          (
            kube_pod_container_status_last_terminated_reason{job="kube-state-metrics", namespace=~".*", reason="OOMKilled"}
            * on (namespace, pod, container) kube_pod_container_status_restarts_total
            * on (namespace, pod) group_left(node) kube_pod_info
          ) [3d:]
        ) > 0
      labels:
        severity: warning
    - alert: KubeContainerOOMKillSurge
      annotations:
        description: 'OOMKill Surge: Container {{ $labels.namespace }}/{{ $labels.pod }}/{{ $labels.container }}
          on Node {{ $labels.node }} exceeds its memory allocation limit (OOMKilled). Pod restarted
          {{ printf "%.0f" $value }} time(s) in last hour.'
        summary: 'OOMKill Surge: Container killed by exceeding memory allocation limits (OOMKilled) in last hour.'
      expr: |-
        round(
          delta(
            (
              kube_pod_container_status_last_terminated_reason{job="kube-state-metrics", namespace=~".*", reason="OOMKilled"}
              * on (namespace, pod, container) kube_pod_container_status_restarts_total
              * on (namespace, pod) group_left(node) kube_pod_info
            ) [61m:]
          ), 1
        ) > 0
      labels:
        severity: critical
{% endraw %}
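Both expressions hinge on the same join: `kube_pod_container_status_last_terminated_reason{reason="OOMKilled"}` is a 0/1 gauge, multiplying it by `kube_pod_container_status_restarts_total` turns the sample value into the restart count, and the `group_left(node)` join against `kube_pod_info` attaches the `node` label used in the annotations. `max_over_time(...[3d:])` keeps KubeContainerOOMKilled firing for three days after the last kill, while `delta(...[61m:])` counts restarts over roughly the last hour for the surge alert. The sketch below evaluates the first expression by hand against the standard Prometheus HTTP API; the localhost URL is an assumption (e.g. after a `kubectl port-forward` into `metalk8s-monitoring`), not something this commit provides:

```python
import json
import urllib.parse
import urllib.request

# Assumed endpoint: Prometheus reachable on localhost:9090, e.g. via
# a port-forward into the metalk8s-monitoring namespace.
PROMETHEUS_URL = "http://localhost:9090"

# Same expression as the KubeContainerOOMKilled rule above.
QUERY = (
    'max_over_time(('
    'kube_pod_container_status_last_terminated_reason'
    '{job="kube-state-metrics", namespace=~".*", reason="OOMKilled"}'
    ' * on (namespace, pod, container) kube_pod_container_status_restarts_total'
    ' * on (namespace, pod) group_left(node) kube_pod_info'
    ')[3d:]) > 0'
)

url = f"{PROMETHEUS_URL}/api/v1/query?" + urllib.parse.urlencode({"query": QUERY})
with urllib.request.urlopen(url) as resp:
    result = json.load(resp)["data"]["result"]

# One series per OOM-killed container, valued at its total restart count.
for sample in result:
    m = sample["metric"]
    print(f'{m["namespace"]}/{m["pod"]}/{m["container"]} on {m.get("node", "?")}: '
          f'{sample["value"][1]} restart(s)')
```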
3 changes: 2 additions & 1 deletion salt/metalk8s/addons/prometheus-operator/deployed/init.sls
@@ -7,5 +7,6 @@ include:
  - .dashboards
  - .service-configuration
  - .chart
  - .prometheus-rules
  - .node-alerts-rules
  - .container-alerts-rules
  - .thanos-chart
salt/metalk8s/addons/prometheus-operator/deployed/node-alerts-rules.sls (renamed from prometheus-rules.sls)
@@ -17,14 +17,9 @@ apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  labels:
    app: prometheus-operator
    app.kubernetes.io/managed-by: salt
    app.kubernetes.io/name: prometheus-operator
    app.kubernetes.io/part-of: metalk8s
    chart: prometheus-operator-8.1.2
    metalk8s.scality.com/monitor: ''
    heritage: metalk8s
  name: prometheus-operator-node-exporter
  name: metalk8s-nodes.rules
  namespace: metalk8s-monitoring
spec:
  groups:
228 changes: 120 additions & 108 deletions tools/rule_extractor/alerting_rules.json
@@ -161,6 +161,126 @@
"query": "sum by(persistentvolumeclaim, namespace, instance) (ALERTS{alertname=\"KubePersistentVolumeFillingUp\",alertstate=\"firing\",severity=\"warning\"}) >= 1",
"severity": "warning"
},
{
"message": "OOMKill Surge: Container killed by exceeding memory allocation limits (OOMKilled) in last hour.",
"name": "KubeContainerOOMKillSurge",
"query": "round(delta((kube_pod_container_status_last_terminated_reason{job=\"kube-state-metrics\",namespace=~\".*\",reason=\"OOMKilled\"} * on(namespace, pod, container) kube_pod_container_status_restarts_total * on(namespace, pod) group_left(node) kube_pod_info)[1h1m:]), 1) > 0",
"severity": "critical"
},
{
"message": "Container killed by exceeding memory allocation limits (OOMKilled).",
"name": "KubeContainerOOMKilled",
"query": "max_over_time((kube_pod_container_status_last_terminated_reason{job=\"kube-state-metrics\",namespace=~\".*\",reason=\"OOMKilled\"} * on(namespace, pod, container) kube_pod_container_status_restarts_total * on(namespace, pod) group_left(node) kube_pod_info)[3d:]) > 0",
"severity": "warning"
},
{
"message": "Clock on {{ $labels.instance }} is not synchronising. Ensure NTP is configured on this host.",
"name": "NodeClockNotSynchronising",
"query": "min_over_time(node_timex_sync_status[5m]) == 0 and node_timex_maxerror_seconds >= 16",
"severity": "warning"
},
{
"message": "Clock on {{ $labels.instance }} is out of sync by more than 300s. Ensure NTP is configured correctly on this host.",
"name": "NodeClockSkewDetected",
"query": "(node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0)",
"severity": "warning"
},
{
"message": "Kernel is predicted to exhaust file descriptors limit soon.",
"name": "NodeFileDescriptorLimit",
"query": "(node_filefd_allocated{job=\"node-exporter\"} * 100 / node_filefd_maximum{job=\"node-exporter\"} > 90)",
"severity": "critical"
},
{
"message": "Kernel is predicted to exhaust file descriptors limit soon.",
"name": "NodeFileDescriptorLimit",
"query": "(node_filefd_allocated{job=\"node-exporter\"} * 100 / node_filefd_maximum{job=\"node-exporter\"} > 70)",
"severity": "warning"
},
{
"message": "Filesystem has less than 8% inodes left.",
"name": "NodeFilesystemAlmostOutOfFiles",
"query": "(node_filesystem_files_free{fstype!=\"\",job=\"node-exporter\"} / node_filesystem_files{fstype!=\"\",job=\"node-exporter\"} * 100 < 8 and node_filesystem_readonly{fstype!=\"\",job=\"node-exporter\"} == 0)",
"severity": "critical"
},
{
"message": "Filesystem has less than 15% inodes left.",
"name": "NodeFilesystemAlmostOutOfFiles",
"query": "(node_filesystem_files_free{fstype!=\"\",job=\"node-exporter\"} / node_filesystem_files{fstype!=\"\",job=\"node-exporter\"} * 100 < 15 and node_filesystem_readonly{fstype!=\"\",job=\"node-exporter\"} == 0)",
"severity": "warning"
},
{
"message": "Filesystem has less than 12% space left.",
"name": "NodeFilesystemAlmostOutOfSpace",
"query": "(node_filesystem_avail_bytes{fstype!=\"\",job=\"node-exporter\"} / node_filesystem_size_bytes{fstype!=\"\",job=\"node-exporter\"} * 100 < 12 and node_filesystem_readonly{fstype!=\"\",job=\"node-exporter\"} == 0)",
"severity": "critical"
},
{
"message": "Filesystem has less than 20% space left.",
"name": "NodeFilesystemAlmostOutOfSpace",
"query": "(node_filesystem_avail_bytes{fstype!=\"\",job=\"node-exporter\"} / node_filesystem_size_bytes{fstype!=\"\",job=\"node-exporter\"} * 100 < 20 and node_filesystem_readonly{fstype!=\"\",job=\"node-exporter\"} == 0)",
"severity": "warning"
},
{
"message": "Filesystem is predicted to run out of inodes within the next 4 hours.",
"name": "NodeFilesystemFilesFillingUp",
"query": "(node_filesystem_files_free{fstype!=\"\",job=\"node-exporter\"} / node_filesystem_files{fstype!=\"\",job=\"node-exporter\"} * 100 < 20 and predict_linear(node_filesystem_files_free{fstype!=\"\",job=\"node-exporter\"}[6h], 4 * 60 * 60) < 0 and node_filesystem_readonly{fstype!=\"\",job=\"node-exporter\"} == 0)",
"severity": "critical"
},
{
"message": "Filesystem is predicted to run out of inodes within the next 24 hours.",
"name": "NodeFilesystemFilesFillingUp",
"query": "(node_filesystem_files_free{fstype!=\"\",job=\"node-exporter\"} / node_filesystem_files{fstype!=\"\",job=\"node-exporter\"} * 100 < 40 and predict_linear(node_filesystem_files_free{fstype!=\"\",job=\"node-exporter\"}[6h], 24 * 60 * 60) < 0 and node_filesystem_readonly{fstype!=\"\",job=\"node-exporter\"} == 0)",
"severity": "warning"
},
{
"message": "Filesystem is predicted to run out of space within the next 4 hours.",
"name": "NodeFilesystemSpaceFillingUp",
"query": "(node_filesystem_avail_bytes{fstype!=\"\",job=\"node-exporter\"} / node_filesystem_size_bytes{fstype!=\"\",job=\"node-exporter\"} * 100 < 20 and predict_linear(node_filesystem_avail_bytes{fstype!=\"\",job=\"node-exporter\"}[6h], 4 * 60 * 60) < 0 and node_filesystem_readonly{fstype!=\"\",job=\"node-exporter\"} == 0)",
"severity": "critical"
},
{
"message": "Filesystem is predicted to run out of space within the next 24 hours.",
"name": "NodeFilesystemSpaceFillingUp",
"query": "(node_filesystem_avail_bytes{fstype!=\"\",job=\"node-exporter\"} / node_filesystem_size_bytes{fstype!=\"\",job=\"node-exporter\"} * 100 < 40 and predict_linear(node_filesystem_avail_bytes{fstype!=\"\",job=\"node-exporter\"}[6h], 24 * 60 * 60) < 0 and node_filesystem_readonly{fstype!=\"\",job=\"node-exporter\"} == 0)",
"severity": "warning"
},
{
"message": "Number of conntrack are getting close to the limit",
"name": "NodeHighNumberConntrackEntriesUsed",
"query": "(node_nf_conntrack_entries / node_nf_conntrack_entries_limit) > 0.75",
"severity": "warning"
},
{
"message": "Network interface is reporting many receive errors.",
"name": "NodeNetworkReceiveErrs",
"query": "increase(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01",
"severity": "warning"
},
{
"message": "Network interface is reporting many transmit errors.",
"name": "NodeNetworkTransmitErrs",
"query": "increase(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01",
"severity": "warning"
},
{
"message": "RAID Array is degraded",
"name": "NodeRAIDDegraded",
"query": "node_md_disks_required - ignoring(state) (node_md_disks{state=\"active\"}) >= 1",
"severity": "critical"
},
{
"message": "Failed device in RAID array",
"name": "NodeRAIDDiskFailure",
"query": "node_md_disks{state=\"failed\"} >= 1",
"severity": "warning"
},
{
"message": "Node Exporter text file collector failed to scrape.",
"name": "NodeTextFileCollectorScrapeError",
"query": "node_textfile_scrape_error{job=\"node-exporter\"} == 1",
"severity": "warning"
},
{
"message": "Half or more of the Alertmanager instances within the same cluster are crashlooping.",
"name": "AlertmanagerClusterCrashlooping",
@@ -671,114 +791,6 @@
"query": "absent(up{job=\"kube-scheduler\"} == 1)",
"severity": "critical"
},
{
"message": "Clock on {{ $labels.instance }} is not synchronising. Ensure NTP is configured on this host.",
"name": "NodeClockNotSynchronising",
"query": "min_over_time(node_timex_sync_status[5m]) == 0 and node_timex_maxerror_seconds >= 16",
"severity": "warning"
},
{
"message": "Clock on {{ $labels.instance }} is out of sync by more than 300s. Ensure NTP is configured correctly on this host.",
"name": "NodeClockSkewDetected",
"query": "(node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0)",
"severity": "warning"
},
{
"message": "Kernel is predicted to exhaust file descriptors limit soon.",
"name": "NodeFileDescriptorLimit",
"query": "(node_filefd_allocated{job=\"node-exporter\"} * 100 / node_filefd_maximum{job=\"node-exporter\"} > 90)",
"severity": "critical"
},
{
"message": "Kernel is predicted to exhaust file descriptors limit soon.",
"name": "NodeFileDescriptorLimit",
"query": "(node_filefd_allocated{job=\"node-exporter\"} * 100 / node_filefd_maximum{job=\"node-exporter\"} > 70)",
"severity": "warning"
},
{
"message": "Filesystem has less than 8% inodes left.",
"name": "NodeFilesystemAlmostOutOfFiles",
"query": "(node_filesystem_files_free{fstype!=\"\",job=\"node-exporter\"} / node_filesystem_files{fstype!=\"\",job=\"node-exporter\"} * 100 < 8 and node_filesystem_readonly{fstype!=\"\",job=\"node-exporter\"} == 0)",
"severity": "critical"
},
{
"message": "Filesystem has less than 15% inodes left.",
"name": "NodeFilesystemAlmostOutOfFiles",
"query": "(node_filesystem_files_free{fstype!=\"\",job=\"node-exporter\"} / node_filesystem_files{fstype!=\"\",job=\"node-exporter\"} * 100 < 15 and node_filesystem_readonly{fstype!=\"\",job=\"node-exporter\"} == 0)",
"severity": "warning"
},
{
"message": "Filesystem has less than 12% space left.",
"name": "NodeFilesystemAlmostOutOfSpace",
"query": "(node_filesystem_avail_bytes{fstype!=\"\",job=\"node-exporter\"} / node_filesystem_size_bytes{fstype!=\"\",job=\"node-exporter\"} * 100 < 12 and node_filesystem_readonly{fstype!=\"\",job=\"node-exporter\"} == 0)",
"severity": "critical"
},
{
"message": "Filesystem has less than 20% space left.",
"name": "NodeFilesystemAlmostOutOfSpace",
"query": "(node_filesystem_avail_bytes{fstype!=\"\",job=\"node-exporter\"} / node_filesystem_size_bytes{fstype!=\"\",job=\"node-exporter\"} * 100 < 20 and node_filesystem_readonly{fstype!=\"\",job=\"node-exporter\"} == 0)",
"severity": "warning"
},
{
"message": "Filesystem is predicted to run out of inodes within the next 4 hours.",
"name": "NodeFilesystemFilesFillingUp",
"query": "(node_filesystem_files_free{fstype!=\"\",job=\"node-exporter\"} / node_filesystem_files{fstype!=\"\",job=\"node-exporter\"} * 100 < 20 and predict_linear(node_filesystem_files_free{fstype!=\"\",job=\"node-exporter\"}[6h], 4 * 60 * 60) < 0 and node_filesystem_readonly{fstype!=\"\",job=\"node-exporter\"} == 0)",
"severity": "critical"
},
{
"message": "Filesystem is predicted to run out of inodes within the next 24 hours.",
"name": "NodeFilesystemFilesFillingUp",
"query": "(node_filesystem_files_free{fstype!=\"\",job=\"node-exporter\"} / node_filesystem_files{fstype!=\"\",job=\"node-exporter\"} * 100 < 40 and predict_linear(node_filesystem_files_free{fstype!=\"\",job=\"node-exporter\"}[6h], 24 * 60 * 60) < 0 and node_filesystem_readonly{fstype!=\"\",job=\"node-exporter\"} == 0)",
"severity": "warning"
},
{
"message": "Filesystem is predicted to run out of space within the next 4 hours.",
"name": "NodeFilesystemSpaceFillingUp",
"query": "(node_filesystem_avail_bytes{fstype!=\"\",job=\"node-exporter\"} / node_filesystem_size_bytes{fstype!=\"\",job=\"node-exporter\"} * 100 < 20 and predict_linear(node_filesystem_avail_bytes{fstype!=\"\",job=\"node-exporter\"}[6h], 4 * 60 * 60) < 0 and node_filesystem_readonly{fstype!=\"\",job=\"node-exporter\"} == 0)",
"severity": "critical"
},
{
"message": "Filesystem is predicted to run out of space within the next 24 hours.",
"name": "NodeFilesystemSpaceFillingUp",
"query": "(node_filesystem_avail_bytes{fstype!=\"\",job=\"node-exporter\"} / node_filesystem_size_bytes{fstype!=\"\",job=\"node-exporter\"} * 100 < 40 and predict_linear(node_filesystem_avail_bytes{fstype!=\"\",job=\"node-exporter\"}[6h], 24 * 60 * 60) < 0 and node_filesystem_readonly{fstype!=\"\",job=\"node-exporter\"} == 0)",
"severity": "warning"
},
{
"message": "Number of conntrack are getting close to the limit",
"name": "NodeHighNumberConntrackEntriesUsed",
"query": "(node_nf_conntrack_entries / node_nf_conntrack_entries_limit) > 0.75",
"severity": "warning"
},
{
"message": "Network interface is reporting many receive errors.",
"name": "NodeNetworkReceiveErrs",
"query": "increase(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01",
"severity": "warning"
},
{
"message": "Network interface is reporting many transmit errors.",
"name": "NodeNetworkTransmitErrs",
"query": "increase(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01",
"severity": "warning"
},
{
"message": "RAID Array is degraded",
"name": "NodeRAIDDegraded",
"query": "node_md_disks_required - ignoring(state) (node_md_disks{state=\"active\"}) >= 1",
"severity": "critical"
},
{
"message": "Failed device in RAID array",
"name": "NodeRAIDDiskFailure",
"query": "node_md_disks{state=\"failed\"} >= 1",
"severity": "warning"
},
{
"message": "Node Exporter text file collector failed to scrape.",
"name": "NodeTextFileCollectorScrapeError",
"query": "node_textfile_scrape_error{job=\"node-exporter\"} == 1",
"severity": "warning"
},
{
"message": "Network interface is often changing its status",
"name": "NodeNetworkInterfaceFlapping",
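`tools/rule_extractor/alerting_rules.json` is the extractor's flat inventory of every deployed alert, one `{message, name, query, severity}` object per rule, which is why both the two new OOMKill rules and the relocated `Node*` entries show up in this diff. A hedged sketch of how the file can be consumed, assuming its top level is the array of rule objects excerpted above (the diff does not show the file head, so this is an assumption):

```python
import json
from collections import Counter

# Sketch only: assumes the top level of alerting_rules.json is the array
# of {"message", "name", "query", "severity"} objects shown in this diff.
with open("tools/rule_extractor/alerting_rules.json") as f:
    rules = json.load(f)

# Count rules per severity level.
print(Counter(rule["severity"] for rule in rules))

# Show the two container OOMKill alerts added by this commit.
for rule in rules:
    if rule["name"].startswith("KubeContainerOOM"):
        print(f'{rule["name"]}: {rule["severity"]} -- {rule["message"]}')
```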
