forked from openshift/cluster-monitoring-operator
-
Notifications
You must be signed in to change notification settings - Fork 1
/
rules.jsonnet
71 lines (71 loc) · 2.89 KB
/
rules.jsonnet
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
// Mixin object that appends two rule groups ('kubernetes.rules' and
// 'openshift-build.rules') to the hidden `prometheusRules.groups` field.
// Both `prometheusRules+::` and `groups+:` use the `+` form, so the groups
// are added to whatever the object this is merged onto already defines.
{
  prometheusRules+:: {
    groups+: [
      {
        name: 'kubernetes.rules',
        rules: [
          // ---- Per-pod aggregates -------------------------------------
          // Each rule sums over series whose container_name is not "POD"
          // and whose pod_name is non-empty, grouped by (pod_name, namespace).
          {
            expr: 'sum(container_memory_usage_bytes{container_name!="POD",pod_name!=""}) BY (pod_name, namespace)',
            record: 'pod_name:container_memory_usage_bytes:sum',
          },
          {
            expr: 'sum(container_spec_cpu_shares{container_name!="POD",pod_name!=""}) BY (pod_name, namespace)',
            record: 'pod_name:container_spec_cpu_shares:sum',
          },
          {
            // CPU usage is a counter, so it is converted to a 5m rate first.
            expr: 'sum(rate(container_cpu_usage_seconds_total{container_name!="POD",pod_name!=""}[5m])) BY (pod_name, namespace)',
            record: 'pod_name:container_cpu_usage:sum',
          },
          {
            expr: 'sum(container_fs_usage_bytes{container_name!="POD",pod_name!=""}) BY (pod_name, namespace)',
            record: 'pod_name:container_fs_usage_bytes:sum',
          },

          // ---- Per-namespace aggregates -------------------------------
          // These rules drop series with an empty container_name and group
          // by namespace only.
          {
            expr: 'sum(container_memory_usage_bytes{container_name!=""}) BY (namespace)',
            record: 'namespace:container_memory_usage_bytes:sum',
          },
          {
            expr: 'sum(container_spec_cpu_shares{container_name!=""}) BY (namespace)',
            record: 'namespace:container_spec_cpu_shares:sum',
          },
          {
            // Excludes series with an empty container_name in addition to
            // "POD", consistent with the other namespace-level rules. Without
            // the container_name!="" matcher, series that carry no container
            // name (presumably cgroup-hierarchy totals -- confirm against the
            // scraped cAdvisor labels) are summed together with the
            // per-container series and can inflate the namespace total.
            expr: 'sum(rate(container_cpu_usage_seconds_total{container_name!="POD",container_name!=""}[5m])) BY (namespace)',
            record: 'namespace:container_cpu_usage:sum',
          },

          // ---- Cluster-wide ratios ------------------------------------
          {
            // Container memory usage divided by machine memory, both grouped
            // by the `cluster` label.
            expr: 'sum(container_memory_usage_bytes{container_name!="POD",pod_name!=""}) BY (cluster) / sum(machine_memory_bytes) BY (cluster)',
            record: 'cluster:memory_usage:ratio',
          },
          {
            // Summed CPU shares scaled by 1/1000, divided by total machine cores.
            // NOTE(review): Kubernetes presumably maps one CPU to 1024 shares;
            // confirm whether 1000 is the intended divisor here.
            expr: 'sum(container_spec_cpu_shares{container_name!="POD",pod_name!=""}) / 1000 / sum(machine_cpu_cores)',
            record: 'cluster:container_spec_cpu_shares:ratio',
          },
          {
            // 5m CPU usage rate divided by total machine cores.
            expr: 'sum(rate(container_cpu_usage_seconds_total{container_name!="POD",pod_name!=""}[5m])) / sum(machine_cpu_cores)',
            record: 'cluster:container_cpu_usage:ratio',
          },

          // ---- Alerts -------------------------------------------------
          {
            // Reconcile errors as a percentage of reconcile attempts over a
            // 15m window; the alert fires once that percentage has stayed
            // above 10 for 15 minutes.
            expr: 'sum(rate(cluster_monitoring_operator_reconcile_errors_total[15m])) * 100 / sum(rate(cluster_monitoring_operator_reconcile_attempts_total[15m])) > 10',
            alert: 'ClusterMonitoringOperatorErrors',
            // `for` is quoted because it is a reserved word in Jsonnet.
            'for': '15m',
            annotations: {
              message: 'Cluster Monitoring Operator is experiencing {{ printf "%0.0f" $value }}% errors.',
            },
            labels: {
              severity: 'critical',
            },
          },
        ],
      },
      {
        name: 'openshift-build.rules',
        rules: [
          {
            // Builds in phase "Error" divided by builds in any of the phases
            // Failed, Complete or Error, for job "kubernetes-apiservers".
            expr: 'sum(openshift_build_total{job="kubernetes-apiservers",phase="Error"})/(sum(openshift_build_total{job="kubernetes-apiservers",phase=~"Failed|Complete|Error"}))',
            record: 'build_error_rate',
          },
        ],
      },
    ],
  },
}