New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[WIP] Prometheus Monitoring Mixin for Prometheus itself. #4474

Open
wants to merge 5 commits into
base: master
from
Jump to file or symbol
Failed to load files and symbols.
+199 −0
Diff settings

Always

Just for now

@@ -0,0 +1,79 @@
// Alerting rules for Prometheus and Alertmanager.
//
// The rules are appended to the hidden `prometheusAlerts` field so that they
// compose with rule groups contributed by other mixins. Every expression is
// a format string expanded against `$._config`, which must provide
// `prometheusSelector` and `alertmanagerSelector` (label matchers that are
// spliced between the {} of each PromQL selector).
{
  prometheusAlerts+:: {
    groups+: [
      {
        name: 'prometheus',
        rules: [
          // Prometheus could not load its most recent configuration.
          {
            alert: 'PromBadConfig',
            expr: |||
              prometheus_config_last_reload_successful{%(prometheusSelector)s} == 0
            ||| % $._config,
            'for': '15m',
            labels: {
              severity: 'critical',
            },
            annotations: {
              // Key must be `message`, like every other rule in this group.
              message: 'Prometheus failed to reload config, see container logs',
            },
          },
          // Alertmanager could not load its most recent configuration.
          {
            alert: 'PromAlertmanagerBadConfig',
            expr: |||
              alertmanager_config_last_reload_successful{%(alertmanagerSelector)s} == 0
            ||| % $._config,
            'for': '10m',
            labels: {
              severity: 'critical',
            },
            annotations: {
              message: 'Alertmanager failed to reload config, see container logs',
            },
          },
          // More than 1% of Alertmanager notifications are failing.
          {
            alert: 'PromAlertsFailed',
            expr: |||
              100 * rate(alertmanager_notifications_failed_total{%(alertmanagerSelector)s}[5m]) / rate(alertmanager_notifications_total{%(alertmanagerSelector)s}[5m]) > 1
            ||| % $._config,
            'for': '5m',
            labels: {
              severity: 'critical',
            },
            annotations: {
              message: 'Alertmanager failed to send {{ printf "%.1f" $value }}% alerts to {{ $labels.integration }}.',
            },
          },
          // More than 1% of remote-write samples (failed / (failed + succeeded)) are failing.
          {
            alert: 'PromRemoteStorageFailures',
            expr: |||
              (rate(prometheus_remote_storage_failed_samples_total{%(prometheusSelector)s}[1m]) * 100)
              /
              (rate(prometheus_remote_storage_failed_samples_total{%(prometheusSelector)s}[1m]) + rate(prometheus_remote_storage_succeeded_samples_total{%(prometheusSelector)s}[1m]))
              > 1
            ||| % $._config,
            'for': '15m',
            labels: {
              severity: 'critical',
            },
            annotations: {
              message: 'Prometheus failed to send {{ printf "%.1f" $value }}% samples',
            },
          },
          // Any rule evaluation failures at all.
          {
            alert: 'PromRuleFailures',
            expr: |||
              rate(prometheus_rule_evaluation_failures_total{%(prometheusSelector)s}[1m]) > 0
            ||| % $._config,
            'for': '15m',
            labels: {
              severity: 'critical',
            },
            annotations: {
              message: 'Prometheus failed to evaluate {{ printf "%.1f" $value }} rules / s',
            },
          },
        ],
      },
    ],
  },
}
@@ -0,0 +1,7 @@
// Default configuration shared by the alert rules of this mixin.
// Each value is a PromQL label matcher that gets spliced between the {} of a
// metric selector, so override these to match your own job labels.
local defaultSelectors = {
  prometheusSelector: 'job="prometheus"',
  alertmanagerSelector: 'job="alertmanager"',
};

{
  // Hidden (::) so it never shows up in rendered output; merged (+) so
  // users can override individual keys.
  _config+:: defaultSelectors,
}
@@ -0,0 +1,96 @@
local g = import 'grafana-builder/grafana.libsonnet';
// Grafana dashboard for Prometheus itself, built with the grafana-builder
// helpers bound to `g`. The dashboard is appended to the `dashboards` field
// under the file name 'prometheus.json'.
{
  dashboards+: {
    'prometheus.json':
      g.dashboard('Prometheus')
      .addMultiTemplate('job', 'prometheus_build_info', 'job')
      .addMultiTemplate('instance', 'prometheus_build_info', 'instance')
      // Prometheus is quite commonly configured with honor_labels set to true;
      // therefore job and instance are not the Prometheus server in many queries!
      .addRow(
        g.row('Prometheus Stats')
        .addPanel(
          g.panel('Prometheus Stats') +
          g.tablePanel([
            'count by (job, instance, version) (prometheus_build_info{job=~"$job", instance=~"$instance"})',
            'max by (job, instance) (time() - process_start_time_seconds{job=~"$job", instance=~"$instance"})',
          ], {
            // Keys are the column names produced by the queries above.
            job: { alias: 'Job' },
            instance: { alias: 'Instance' },
            // Must match the `version` label of the first query for the alias to apply.
            version: { alias: 'Version' },
            'Value #A': { alias: 'Count', type: 'hidden' },
            'Value #B': { alias: 'Uptime' },
          })
        )
      )
      .addRow(
        g.row('Discovery')
        .addPanel(
          g.panel('Target Sync') +
          // Multiplied by 1e3 to display seconds as milliseconds.
          g.queryPanel('sum(rate(prometheus_target_sync_length_seconds_sum{job=~"$job",instance=~"$instance"}[2m])) by (scrape_job) * 1e3', '{{scrape_job}}') +
          { yaxes: g.yaxes('ms') }
        )
        .addPanel(
          g.panel('Targets') +
          g.queryPanel('count(up{})', 'Targets') +
          g.stack
        )
      )
      .addRow(
        g.row('Retrieval')
        .addPanel(
          g.panel('Target Scrape Duration') +
          g.queryPanel('1e3 * sum(scrape_duration_seconds) / count(scrape_duration_seconds)', 'Average') +
          { yaxes: g.yaxes('ms') }
        )
        .addPanel(
          g.panel('Scrape failures') +
          // Queries and legend formats are matched up by position.
          g.queryPanel([
            'sum by (job) (rate(prometheus_target_scrapes_exceeded_sample_limit_total[1m]))',
            'sum by (job) (rate(prometheus_target_scrapes_sample_duplicate_timestamp_total[1m]))',
            'sum by (job) (rate(prometheus_target_scrapes_sample_out_of_bounds_total[1m]))',
            'sum by (job) (rate(prometheus_target_scrapes_sample_out_of_order_total[1m]))',
          ], [
            'exceeded sample limit: {{job}}',
            'duplicate timestamp: {{job}}',
            'out of bounds: {{job}}',
            'out of order: {{job}}',
          ]) +
          g.stack
        )
        .addPanel(
          g.panel('Appended Samples') +
          g.queryPanel('rate(prometheus_tsdb_head_samples_appended_total{job=~"$job",instance=~"$instance"}[1m])', '{{job}} {{instance}}') +
          g.stack
        )
      )
      .addRow(
        g.row('Storage')
        .addPanel(
          g.panel('Head Series') +
          g.queryPanel('prometheus_tsdb_head_series{job=~"$job",instance=~"$instance"}', '{{job}} {{instance}} head series') +
          g.stack
        )
        .addPanel(
          g.panel('Head Chunks') +
          g.queryPanel('prometheus_tsdb_head_chunks{job=~"$job",instance=~"$instance"}', '{{job}} {{instance}} head chunks') +
          g.stack
        )
      )
      .addRow(
        g.row('Query')
        .addPanel(
          g.panel('Query Rate') +
          g.queryPanel('rate(prometheus_engine_query_duration_seconds_count{job=~"$job",instance=~"$instance",slice="inner_eval"}[1m])', '{{job}} {{instance}}') +
          g.stack,
        )
        .addPanel(
          g.panel('Stage Duration') +
          g.queryPanel('max by (slice) (prometheus_engine_query_duration_seconds{quantile="0.9",job=~"$job",instance=~"$instance"}) * 1e3', '{{slice}}') +
          { yaxes: g.yaxes('ms') } +
          g.stack,
        )
      ),
  },
}
@@ -0,0 +1,14 @@
{
"dependencies": [
{
"name": "grafana-builder",
"source": {
"git": {
"remote": "https://github.com/kausalco/public",
"subdir": "grafana-builder"
}
},
"version": "master"
}
]
}
@@ -0,0 +1,3 @@
// Entry point of the mixin: merges the configuration, dashboards and alerts
// objects, in that order, into a single object.
local config = import 'config.libsonnet';
local dashboards = import 'dashboards.libsonnet';
local alerts = import 'alerts.libsonnet';

config + dashboards + alerts
ProTip! Use n and p to navigate between commits in a pull request.