Skip to content

Commit

Permalink
postgres-operator: add grafana dashboard and prometheus rules
Browse files Browse the repository at this point in the history
Signed-off-by: Victor Login <batazor@evrone.com>
  • Loading branch information
batazor committed Jun 11, 2023
1 parent 64aa54b commit 9e6667b
Show file tree
Hide file tree
Showing 2 changed files with 344 additions and 2 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,15 @@ metadata:
labels:
grafana_dashboard: "true"
annotations:
grafana_dashboard_folder: DataBase
grafana_dashboard_folder: DataBase/Postgres
data:
postgres-dashboard.json.url: "https://raw.githubusercontent.com/cloudnative-pg/cloudnative-pg/main/docs/src/samples/monitoring/grafana-configmap.yaml"
postgres_bloat_details.json.url: "https://raw.githubusercontent.com/CrunchyData/pgmonitor/main/grafana/common/Bloat_Details.json"
postgres_crud_summary.json.url: "https://raw.githubusercontent.com/CrunchyData/pgmonitor/main/grafana/common/CRUD_Details.json"
postgres_etcd_summary.json.url: "https://raw.githubusercontent.com/CrunchyData/pgmonitor/main/grafana/common/ETCD_Details.json"
postgres_PGBackrest.json.url: "https://raw.githubusercontent.com/CrunchyData/pgmonitor/main/grafana/common/PGBackrest.json"
postgres_PGBouncer.json.url: "https://raw.githubusercontent.com/CrunchyData/pgmonitor/main/grafana/common/PGBouncer.json"
postgres_PG_Details.json.url: "https://raw.githubusercontent.com/CrunchyData/pgmonitor/main/grafana/common/PG_Details.json"
postgres_PG_Overview.json.url: "https://raw.githubusercontent.com/CrunchyData/pgmonitor/main/grafana/common/PG_Overview.json"
postgres_Prometheus_Alerts.json.url: "https://raw.githubusercontent.com/CrunchyData/pgmonitor/main/grafana/common/Prometheus_Alerts.json"
postgres_QueryStatistics.json.url: "https://raw.githubusercontent.com/CrunchyData/pgmonitor/main/grafana/common/QueryStatistics.json"
postgres_TableSize_Details.json.url: "https://raw.githubusercontent.com/CrunchyData/pgmonitor/main/grafana/common/TableSize_Details.json"
333 changes: 333 additions & 0 deletions ops/Helm/addons/store/postgres-operator/templates/prometheus_rule.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,333 @@
---
# PrometheusRule with alerting rules for PostgreSQL instances monitored via the
# pgMonitor / Crunchy postgres_exporter metric set (pg_* / ccp_* series).
# Covers: exporter scrape health, DB connectivity, data-checksum failures,
# idle-in-transaction sessions, long-running queries, connection saturation,
# database size, replication lag, inactive replication slots, transaction-id
# wraparound, emergency autovacuum pressure, archive_command failures,
# sequence exhaustion, and settings pending a restart.
#
# NOTE(review): this file sits under a Helm chart's templates/ directory
# (ops/Helm/.../templates/prometheus_rule.yaml). Helm will attempt to evaluate
# the `{{ $labels.* }}` / `{{ $value }}` expressions below as Go templates and
# error on the undefined `$labels` variable. They likely need escaping, e.g.
# `{{ "{{ $labels.job }}" }}`, or the file should be included raw via
# `.Files.Get` — TODO: confirm how this chart renders the file.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: postgres-operator-rules
spec:
  groups:
    - name: postgres-operator.rules
      rules:
        ########## EXPORTER RULES ##########
        - alert: PGExporterScrapeError
          expr: pg_exporter_last_scrape_error > 0
          for: 60s
          labels:
            service: postgresql
            severity: critical
            severity_num: 300
          annotations:
            summary: 'Postgres Exporter running on {{ $labels.job }} (instance: {{ $labels.instance }}) is encountering scrape errors processing queries. Error count: ( {{ $value }} )'

        ########## POSTGRESQL RULES ##########
        - alert: PGIsUp
          expr: pg_up < 1
          for: 60s
          labels:
            service: postgresql
            severity: critical
            severity_num: 300
          annotations:
            summary: 'postgres_exporter running on {{ $labels.job }} is unable to communicate with the configured database'

        ## Monitor for data block checksum failures. Only works in PG12+.
        - alert: PGDataChecksum
          expr: ccp_data_checksum_failure_count > 0
          # FIX: original read `for 60s` (missing colon) — invalid YAML inside
          # this mapping; every sibling rule uses `for: 60s`.
          for: 60s
          labels:
            service: postgresql
            severity: critical
            severity_num: 300
          annotations:
            description: '{{ $labels.job }} has at least one data checksum failure in database {{ $labels.dbname }}. See pg_stat_database system catalog for more information.'
            summary: 'PGSQL Data Checksum failure'

        # Warning at 5 minutes idle-in-transaction; critical counterpart below.
        - alert: PGIdleTxn
          expr: ccp_connection_stats_max_idle_in_txn_time > 300
          for: 60s
          labels:
            service: postgresql
            severity: warning
            severity_num: 200
          annotations:
            description: '{{ $labels.job }} has at least one session idle in transaction for over 5 minutes.'
            summary: 'PGSQL Instance idle transactions'

        - alert: PGIdleTxn
          expr: ccp_connection_stats_max_idle_in_txn_time > 900
          for: 60s
          labels:
            service: postgresql
            severity: critical
            severity_num: 300
          annotations:
            description: '{{ $labels.job }} has at least one session idle in transaction for over 15 minutes.'
            summary: 'PGSQL Instance idle transactions'

        # Warning at 12h query runtime; critical counterpart below at 24h.
        - alert: PGQueryTime
          expr: ccp_connection_stats_max_query_time > 43200
          for: 60s
          labels:
            service: postgresql
            severity: warning
            severity_num: 200
          annotations:
            description: '{{ $labels.job }} has at least one query running for over 12 hours.'
            summary: 'PGSQL Max Query Runtime'

        - alert: PGQueryTime
          expr: ccp_connection_stats_max_query_time > 86400
          for: 60s
          labels:
            service: postgresql
            severity: critical
            severity_num: 300
          annotations:
            description: '{{ $labels.job }} has at least one query running for over 1 day.'
            summary: 'PGSQL Max Query Runtime'

        # Connection saturation as a percentage of max_connections.
        - alert: PGConnPerc
          expr: 100 * (ccp_connection_stats_total / ccp_connection_stats_max_connections) > 75
          for: 60s
          labels:
            service: postgresql
            severity: warning
            severity_num: 200
          annotations:
            description: '{{ $labels.job }} is using 75% or more of available connections ({{ $value }}%)'
            summary: 'PGSQL Instance connections'

        - alert: PGConnPerc
          expr: 100 * (ccp_connection_stats_total / ccp_connection_stats_max_connections) > 90
          for: 60s
          labels:
            service: postgresql
            severity: critical
            severity_num: 300
          annotations:
            description: '{{ $labels.job }} is using 90% or more of available connections ({{ $value }}%)'
            summary: 'PGSQL Instance connections'

        # Size thresholds: 100 GB warning, 250 GB critical (values in bytes).
        - alert: PGDBSize
          expr: ccp_database_size_bytes > 1.073741824e+11
          for: 60s
          labels:
            service: postgresql
            severity: warning
            severity_num: 200
          annotations:
            description: 'PGSQL Instance {{ $labels.job }} over 100GB in size: {{ $value }} bytes'
            summary: 'PGSQL Instance size warning'

        - alert: PGDBSize
          expr: ccp_database_size_bytes > 2.68435456e+11
          for: 60s
          labels:
            service: postgresql
            severity: critical
            severity_num: 300
          annotations:
            description: 'PGSQL Instance {{ $labels.job }} over 250GB in size: {{ $value }} bytes'
            summary: 'PGSQL Instance size critical'

        # Replication byte lag: 50 MB warning, 100 MB critical.
        - alert: PGReplicationByteLag
          expr: ccp_replication_lag_size_bytes > 5.24288e+07
          for: 60s
          labels:
            service: postgresql
            severity: warning
            severity_num: 200
          annotations:
            description: 'PGSQL Instance {{ $labels.job }} has at least one replica lagging over 50MB behind.'
            summary: 'PGSQL Instance replica lag warning'

        - alert: PGReplicationByteLag
          expr: ccp_replication_lag_size_bytes > 1.048576e+08
          for: 60s
          labels:
            service: postgresql
            severity: critical
            severity_num: 300
          annotations:
            description: 'PGSQL Instance {{ $labels.job }} has at least one replica lagging over 100MB behind.'
            summary: 'PGSQL Instance replica lag warning'

        # Inactive slots retain WAL indefinitely and can fill the disk.
        - alert: PGReplicationSlotsInactive
          expr: ccp_replication_slots_active == 0
          for: 60s
          labels:
            service: postgresql
            severity: critical
            severity_num: 300
          annotations:
            description: 'PGSQL Instance {{ $labels.job }} has one or more inactive replication slots'
            summary: 'PGSQL Instance inactive replication slot'

        # Transaction-id wraparound progress: 50% warning, 75% critical.
        - alert: PGXIDWraparound
          expr: ccp_transaction_wraparound_percent_towards_wraparound > 50
          for: 60s
          labels:
            service: postgresql
            severity: warning
            severity_num: 200
          annotations:
            description: 'PGSQL Instance {{ $labels.job }} is over 50% towards transaction id wraparound.'
            summary: 'PGSQL Instance {{ $labels.job }} transaction id wraparound imminent'

        - alert: PGXIDWraparound
          expr: ccp_transaction_wraparound_percent_towards_wraparound > 75
          for: 60s
          labels:
            service: postgresql
            severity: critical
            severity_num: 300
          annotations:
            description: 'PGSQL Instance {{ $labels.job }} is over 75% towards transaction id wraparound.'
            summary: 'PGSQL Instance transaction id wraparound imminent'

        # Percent past autovacuum_freeze_max_age: 110% warning, 125% critical.
        - alert: PGEmergencyVacuum
          expr: ccp_transaction_wraparound_percent_towards_emergency_autovac > 110
          for: 60s
          labels:
            service: postgresql
            severity: warning
            severity_num: 200
          annotations:
            description: 'PGSQL Instance {{ $labels.job }} is over 110% beyond autovacuum_freeze_max_age value. Autovacuum may need tuning to better keep up.'
            summary: 'PGSQL Instance emergency vacuum imminent'

        - alert: PGEmergencyVacuum
          expr: ccp_transaction_wraparound_percent_towards_emergency_autovac > 125
          for: 60s
          labels:
            service: postgresql
            severity: critical
            severity_num: 300
          annotations:
            description: 'PGSQL Instance {{ $labels.job }} is over 125% beyond autovacuum_freeze_max_age value. Autovacuum needs tuning to better keep up.'
            summary: 'PGSQL Instance emergency vacuum imminent'

        - alert: PGArchiveCommandStatus
          expr: ccp_archive_command_status_seconds_since_last_fail > 300
          for: 60s
          labels:
            service: postgresql
            severity: critical
            severity_num: 300
          annotations:
            description: 'PGSQL Instance {{ $labels.job }} has a recent failing archive command'
            summary: 'Seconds since the last recorded failure of the archive_command'

        - alert: PGSequenceExhaustion
          expr: ccp_sequence_exhaustion_count > 0
          for: 60s
          labels:
            service: postgresql
            severity: critical
            severity_num: 300
          annotations:
            description: 'Count of sequences on instance {{ $labels.job }} at over 75% usage: {{ $value }}. Run following query to see full sequence status: SELECT * FROM monitor.sequence_status() WHERE percent >= 75'

        - alert: PGSettingsPendingRestart
          expr: ccp_settings_pending_restart_count > 0
          for: 60s
          labels:
            service: postgresql
            severity: critical
            severity_num: 300
          annotations:
            description: 'One or more settings in the pg_settings system catalog on system {{ $labels.job }} are in a pending_restart state. Check the system catalog for which settings are pending and review postgresql.conf for changes.'

        ########## PGBACKREST RULES ##########
        ##
        ## Uncomment and customize one or more of these rules to monitor your pgbackrest backups.
        ## Full backups are considered the equivalent of both differentials and incrementals since both are based on the last full
        ## And differentials are considered incrementals since incrementals will be based off the last diff if one exists
        ## This avoids false alerts, for example when you don't run diff/incr backups on the days that you run a full
        ## Stanza should also be set if different intervals are expected for each stanza.
        ## Otherwise rule will be applied to all stanzas returned on target system if not set.
        ##
        ## Relevant metric names are:
        ##   ccp_backrest_last_full_backup_time_since_completion_seconds
        ##   ccp_backrest_last_incr_backup_time_since_completion_seconds
        ##   ccp_backrest_last_diff_backup_time_since_completion_seconds
        ##
        ## To avoid false positives on backup time alerts, 12 hours are added onto each threshold to allow a buffer if the backup runtime varies from day to day.
        ## Further adjustment may be needed depending on your backup runtimes/schedule.
        #
        # - alert: PGBackRestLastCompletedFull_main
        #   expr: ccp_backrest_last_full_backup_time_since_completion_seconds{stanza="main"} > 648000
        #   for: 60s
        #   labels:
        #     service: postgresql
        #     severity: critical
        #     severity_num: 300
        #   annotations:
        #     summary: 'Full backup for stanza [main] on system {{ $labels.job }} has not completed in the last week.'
        #
        # - alert: PGBackRestLastCompletedIncr_main
        #   expr: ccp_backrest_last_incr_backup_time_since_completion_seconds{stanza="main"} > 129600
        #   for: 60s
        #   labels:
        #     service: postgresql
        #     severity: critical
        #     severity_num: 300
        #   annotations:
        #     summary: 'Incremental backup for stanza [main] on system {{ $labels.job }} has not completed in the last 24 hours.'
        #
        ## Runtime monitoring is handled with a single metric:
        ##
        ##   ccp_backrest_last_info_backup_runtime_seconds
        ##
        ## Runtime monitoring should have the "backup_type" label set.
        ## Otherwise the rule will apply to the last run of all backup types returned (full, diff, incr)
        ## Stanza should also be set if runtimes per stanza have different expected times
        #
        # - alert: PGBackRestLastRuntimeFull_main
        #   expr: ccp_backrest_last_info_backup_runtime_seconds{backup_type="full", stanza="main"} > 14400
        #   for: 60s
        #   labels:
        #     service: postgresql
        #     severity: critical
        #     severity_num: 300
        #   annotations:
        #     summary: 'Expected runtime of full backup for stanza [main] has exceeded 4 hours'
        #
        # - alert: PGBackRestLastRuntimeDiff_main
        #   expr: ccp_backrest_last_info_backup_runtime_seconds{backup_type="diff", stanza="main"} > 3600
        #   for: 60s
        #   labels:
        #     service: postgresql
        #     severity: critical
        #     severity_num: 300
        #   annotations:
        #     summary: 'Expected runtime of diff backup for stanza [main] has exceeded 1 hour'
        #
        ## As of pgBackRest version 2.36 errors encountered during a completed backup run (checksum failure, file truncation,
        ## invalid header, etc) can be detected and are reported in the info.
        #
        # - alert: PGBackRestErrorDuringBackup
        #   expr: ccp_backrest_last_info_backup_error > 0
        #   for: 60s
        #   labels:
        #     service: postgresql
        #     severity: critical
        #     severity_num: 300
        #   annotations:
        #     summary: 'Error encountered during pgBackRest backup operation. See logs for more information.'
        #
        ## If the pgbackrest command fails to run, the metric disappears from the exporter output and the alert never fires.
        ## An absence alert must be configured explicitly for each target (job) that backups are being monitored.
        ## Checking for absence of just the full backup type should be sufficient (no need for diff/incr).
        ## Note that while the backrest check command failing will likely also cause a scrape error alert, the addition of this
        ## check gives a clearer answer as to what is causing it and that something is wrong with the backups.
        #
        # - alert: PGBackrestAbsentFull_Prod
        #   expr: absent(ccp_backrest_last_full_backup_time_since_completion_seconds{job="Prod"})
        #   for: 10s
        #   labels:
        #     service: postgresql
        #     severity: critical
        #     severity_num: 300
        #   annotations:
        #     description: 'Backup Full status missing for Prod. Check that pgbackrest info command is working on target system.'
0 comments on commit 9e6667b

Please sign in to comment.