Skip to content

Commit

Permalink
postgres-operator: add grafana dashboard and prometheus rules
Browse files Browse the repository at this point in the history
Signed-off-by: Victor Login <batazor@evrone.com>
  • Loading branch information
batazor committed Jun 11, 2023
1 parent 64aa54b commit 9e6667b
Show file tree
Hide file tree
Showing 2 changed files with 344 additions and 2 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,15 @@ metadata:
labels:
grafana_dashboard: "true"
annotations:
grafana_dashboard_folder: DataBase
grafana_dashboard_folder: DataBase/Postgres
data:
postgres-dashboard.json.url: "https://raw.githubusercontent.com/cloudnative-pg/cloudnative-pg/main/docs/src/samples/monitoring/grafana-configmap.yaml"
postgres_bloat_details.json.url: "https://raw.githubusercontent.com/CrunchyData/pgmonitor/main/grafana/common/Bloat_Details.json"
postgres_crud_summary.json.url: "https://raw.githubusercontent.com/CrunchyData/pgmonitor/main/grafana/common/CRUD_Details.json"
postgres_etcd_summary.json.url: "https://raw.githubusercontent.com/CrunchyData/pgmonitor/main/grafana/common/ETCD_Details.json"
postgres_PGBackrest.json.url: "https://raw.githubusercontent.com/CrunchyData/pgmonitor/main/grafana/common/PGBackrest.json"
postgres_PGBouncer.json.url: "https://raw.githubusercontent.com/CrunchyData/pgmonitor/main/grafana/common/PGBouncer.json"
postgres_PG_Details.json.url: "https://raw.githubusercontent.com/CrunchyData/pgmonitor/main/grafana/common/PG_Details.json"
postgres_PG_Overview.json.url: "https://raw.githubusercontent.com/CrunchyData/pgmonitor/main/grafana/common/PG_Overview.json"
postgres_Prometheus_Alerts.json.url: "https://raw.githubusercontent.com/CrunchyData/pgmonitor/main/grafana/common/Prometheus_Alerts.json"
postgres_QueryStatistics.json.url: "https://raw.githubusercontent.com/CrunchyData/pgmonitor/main/grafana/common/QueryStatistics.json"
postgres_TableSize_Details.json.url: "https://raw.githubusercontent.com/CrunchyData/pgmonitor/main/grafana/common/TableSize_Details.json"
333 changes: 333 additions & 0 deletions ops/Helm/addons/store/postgres-operator/templates/prometheus_rule.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,333 @@
---
# PrometheusRule with alerting rules for PostgreSQL instances monitored via the
# pgMonitor / Crunchy postgres_exporter metric set (pg_* / ccp_* series).
# Covers: exporter scrape health, DB connectivity, data-checksum failures,
# idle-in-transaction sessions, long-running queries, connection saturation,
# database size, replication lag, inactive replication slots, transaction-id
# wraparound, emergency autovacuum pressure, archive_command failures,
# sequence exhaustion, and settings pending a restart.
#
# NOTE(review): this file sits under a Helm chart's templates/ directory
# (ops/Helm/.../templates/prometheus_rule.yaml). Helm will attempt to evaluate
# the `{{ $labels.* }}` / `{{ $value }}` expressions below as Go templates and
# error on the undefined `$labels` variable. They likely need escaping, e.g.
# `{{ "{{ $labels.job }}" }}`, or the file should be included raw via
# `.Files.Get` — TODO: confirm how this chart renders the file.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: postgres-operator-rules
spec:
  groups:
    - name: postgres-operator.rules
      rules:
        ########## EXPORTER RULES ##########
        - alert: PGExporterScrapeError
          expr: pg_exporter_last_scrape_error > 0
          for: 60s
          labels:
            service: postgresql
            severity: critical
            severity_num: 300
          annotations:
            summary: 'Postgres Exporter running on {{ $labels.job }} (instance: {{ $labels.instance }}) is encountering scrape errors processing queries. Error count: ( {{ $value }} )'

        ########## POSTGRESQL RULES ##########
        - alert: PGIsUp
          expr: pg_up < 1
          for: 60s
          labels:
            service: postgresql
            severity: critical
            severity_num: 300
          annotations:
            summary: 'postgres_exporter running on {{ $labels.job }} is unable to communicate with the configured database'

        ## Monitor for data block checksum failures. Only works in PG12+.
        - alert: PGDataChecksum
          expr: ccp_data_checksum_failure_count > 0
          # FIX: original read `for 60s` (missing colon) — invalid YAML inside
          # this mapping; every sibling rule uses `for: 60s`.
          for: 60s
          labels:
            service: postgresql
            severity: critical
            severity_num: 300
          annotations:
            description: '{{ $labels.job }} has at least one data checksum failure in database {{ $labels.dbname }}. See pg_stat_database system catalog for more information.'
            summary: 'PGSQL Data Checksum failure'

        # Warning at 5 minutes idle-in-transaction; critical counterpart below.
        - alert: PGIdleTxn
          expr: ccp_connection_stats_max_idle_in_txn_time > 300
          for: 60s
          labels:
            service: postgresql
            severity: warning
            severity_num: 200
          annotations:
            description: '{{ $labels.job }} has at least one session idle in transaction for over 5 minutes.'
            summary: 'PGSQL Instance idle transactions'

        - alert: PGIdleTxn
          expr: ccp_connection_stats_max_idle_in_txn_time > 900
          for: 60s
          labels:
            service: postgresql
            severity: critical
            severity_num: 300
          annotations:
            description: '{{ $labels.job }} has at least one session idle in transaction for over 15 minutes.'
            summary: 'PGSQL Instance idle transactions'

        # Warning at 12h query runtime; critical counterpart below at 24h.
        - alert: PGQueryTime
          expr: ccp_connection_stats_max_query_time > 43200
          for: 60s
          labels:
            service: postgresql
            severity: warning
            severity_num: 200
          annotations:
            description: '{{ $labels.job }} has at least one query running for over 12 hours.'
            summary: 'PGSQL Max Query Runtime'

        - alert: PGQueryTime
          expr: ccp_connection_stats_max_query_time > 86400
          for: 60s
          labels:
            service: postgresql
            severity: critical
            severity_num: 300
          annotations:
            description: '{{ $labels.job }} has at least one query running for over 1 day.'
            summary: 'PGSQL Max Query Runtime'

        # Connection saturation as a percentage of max_connections.
        - alert: PGConnPerc
          expr: 100 * (ccp_connection_stats_total / ccp_connection_stats_max_connections) > 75
          for: 60s
          labels:
            service: postgresql
            severity: warning
            severity_num: 200
          annotations:
            description: '{{ $labels.job }} is using 75% or more of available connections ({{ $value }}%)'
            summary: 'PGSQL Instance connections'

        - alert: PGConnPerc
          expr: 100 * (ccp_connection_stats_total / ccp_connection_stats_max_connections) > 90
          for: 60s
          labels:
            service: postgresql
            severity: critical
            severity_num: 300
          annotations:
            description: '{{ $labels.job }} is using 90% or more of available connections ({{ $value }}%)'
            summary: 'PGSQL Instance connections'

        # Size thresholds: 100 GB warning, 250 GB critical (values in bytes).
        - alert: PGDBSize
          expr: ccp_database_size_bytes > 1.073741824e+11
          for: 60s
          labels:
            service: postgresql
            severity: warning
            severity_num: 200
          annotations:
            description: 'PGSQL Instance {{ $labels.job }} over 100GB in size: {{ $value }} bytes'
            summary: 'PGSQL Instance size warning'

        - alert: PGDBSize
          expr: ccp_database_size_bytes > 2.68435456e+11
          for: 60s
          labels:
            service: postgresql
            severity: critical
            severity_num: 300
          annotations:
            description: 'PGSQL Instance {{ $labels.job }} over 250GB in size: {{ $value }} bytes'
            summary: 'PGSQL Instance size critical'

        # Replication byte lag: 50 MB warning, 100 MB critical.
        - alert: PGReplicationByteLag
          expr: ccp_replication_lag_size_bytes > 5.24288e+07
          for: 60s
          labels:
            service: postgresql
            severity: warning
            severity_num: 200
          annotations:
            description: 'PGSQL Instance {{ $labels.job }} has at least one replica lagging over 50MB behind.'
            summary: 'PGSQL Instance replica lag warning'

        - alert: PGReplicationByteLag
          expr: ccp_replication_lag_size_bytes > 1.048576e+08
          for: 60s
          labels:
            service: postgresql
            severity: critical
            severity_num: 300
          annotations:
            description: 'PGSQL Instance {{ $labels.job }} has at least one replica lagging over 100MB behind.'
            summary: 'PGSQL Instance replica lag warning'

        # Inactive slots retain WAL indefinitely and can fill the disk.
        - alert: PGReplicationSlotsInactive
          expr: ccp_replication_slots_active == 0
          for: 60s
          labels:
            service: postgresql
            severity: critical
            severity_num: 300
          annotations:
            description: 'PGSQL Instance {{ $labels.job }} has one or more inactive replication slots'
            summary: 'PGSQL Instance inactive replication slot'

        # Transaction-id wraparound progress: 50% warning, 75% critical.
        - alert: PGXIDWraparound
          expr: ccp_transaction_wraparound_percent_towards_wraparound > 50
          for: 60s
          labels:
            service: postgresql
            severity: warning
            severity_num: 200
          annotations:
            description: 'PGSQL Instance {{ $labels.job }} is over 50% towards transaction id wraparound.'
            summary: 'PGSQL Instance {{ $labels.job }} transaction id wraparound imminent'

        - alert: PGXIDWraparound
          expr: ccp_transaction_wraparound_percent_towards_wraparound > 75
          for: 60s
          labels:
            service: postgresql
            severity: critical
            severity_num: 300
          annotations:
            description: 'PGSQL Instance {{ $labels.job }} is over 75% towards transaction id wraparound.'
            summary: 'PGSQL Instance transaction id wraparound imminent'

        # Percent past autovacuum_freeze_max_age: 110% warning, 125% critical.
        - alert: PGEmergencyVacuum
          expr: ccp_transaction_wraparound_percent_towards_emergency_autovac > 110
          for: 60s
          labels:
            service: postgresql
            severity: warning
            severity_num: 200
          annotations:
            description: 'PGSQL Instance {{ $labels.job }} is over 110% beyond autovacuum_freeze_max_age value. Autovacuum may need tuning to better keep up.'
            summary: 'PGSQL Instance emergency vacuum imminent'

        - alert: PGEmergencyVacuum
          expr: ccp_transaction_wraparound_percent_towards_emergency_autovac > 125
          for: 60s
          labels:
            service: postgresql
            severity: critical
            severity_num: 300
          annotations:
            description: 'PGSQL Instance {{ $labels.job }} is over 125% beyond autovacuum_freeze_max_age value. Autovacuum needs tuning to better keep up.'
            summary: 'PGSQL Instance emergency vacuum imminent'

        - alert: PGArchiveCommandStatus
          expr: ccp_archive_command_status_seconds_since_last_fail > 300
          for: 60s
          labels:
            service: postgresql
            severity: critical
            severity_num: 300
          annotations:
            description: 'PGSQL Instance {{ $labels.job }} has a recent failing archive command'
            summary: 'Seconds since the last recorded failure of the archive_command'

        - alert: PGSequenceExhaustion
          expr: ccp_sequence_exhaustion_count > 0
          for: 60s
          labels:
            service: postgresql
            severity: critical
            severity_num: 300
          annotations:
            description: 'Count of sequences on instance {{ $labels.job }} at over 75% usage: {{ $value }}. Run following query to see full sequence status: SELECT * FROM monitor.sequence_status() WHERE percent >= 75'

        - alert: PGSettingsPendingRestart
          expr: ccp_settings_pending_restart_count > 0
          for: 60s
          labels:
            service: postgresql
            severity: critical
            severity_num: 300
          annotations:
            description: 'One or more settings in the pg_settings system catalog on system {{ $labels.job }} are in a pending_restart state. Check the system catalog for which settings are pending and review postgresql.conf for changes.'

        ########## PGBACKREST RULES ##########
        ##
        ## Uncomment and customize one or more of these rules to monitor your pgbackrest backups.
        ## Full backups are considered the equivalent of both differentials and incrementals since both are based on the last full
        ## And differentials are considered incrementals since incrementals will be based off the last diff if one exists
        ## This avoids false alerts, for example when you don't run diff/incr backups on the days that you run a full
        ## Stanza should also be set if different intervals are expected for each stanza.
        ## Otherwise rule will be applied to all stanzas returned on target system if not set.
        ##
        ## Relevant metric names are:
        ##   ccp_backrest_last_full_backup_time_since_completion_seconds
        ##   ccp_backrest_last_incr_backup_time_since_completion_seconds
        ##   ccp_backrest_last_diff_backup_time_since_completion_seconds
        ##
        ## To avoid false positives on backup time alerts, 12 hours are added onto each threshold to allow a buffer if the backup runtime varies from day to day.
        ## Further adjustment may be needed depending on your backup runtimes/schedule.
        #
        # - alert: PGBackRestLastCompletedFull_main
        #   expr: ccp_backrest_last_full_backup_time_since_completion_seconds{stanza="main"} > 648000
        #   for: 60s
        #   labels:
        #     service: postgresql
        #     severity: critical
        #     severity_num: 300
        #   annotations:
        #     summary: 'Full backup for stanza [main] on system {{ $labels.job }} has not completed in the last week.'
        #
        # - alert: PGBackRestLastCompletedIncr_main
        #   expr: ccp_backrest_last_incr_backup_time_since_completion_seconds{stanza="main"} > 129600
        #   for: 60s
        #   labels:
        #     service: postgresql
        #     severity: critical
        #     severity_num: 300
        #   annotations:
        #     summary: 'Incremental backup for stanza [main] on system {{ $labels.job }} has not completed in the last 24 hours.'
        #
        ## Runtime monitoring is handled with a single metric:
        ##
        ##   ccp_backrest_last_info_backup_runtime_seconds
        ##
        ## Runtime monitoring should have the "backup_type" label set.
        ## Otherwise the rule will apply to the last run of all backup types returned (full, diff, incr)
        ## Stanza should also be set if runtimes per stanza have different expected times
        #
        # - alert: PGBackRestLastRuntimeFull_main
        #   expr: ccp_backrest_last_info_backup_runtime_seconds{backup_type="full", stanza="main"} > 14400
        #   for: 60s
        #   labels:
        #     service: postgresql
        #     severity: critical
        #     severity_num: 300
        #   annotations:
        #     summary: 'Expected runtime of full backup for stanza [main] has exceeded 4 hours'
        #
        # - alert: PGBackRestLastRuntimeDiff_main
        #   expr: ccp_backrest_last_info_backup_runtime_seconds{backup_type="diff", stanza="main"} > 3600
        #   for: 60s
        #   labels:
        #     service: postgresql
        #     severity: critical
        #     severity_num: 300
        #   annotations:
        #     summary: 'Expected runtime of diff backup for stanza [main] has exceeded 1 hour'
        #
        ## As of pgBackRest version 2.36 errors encountered during a completed backup run (checksum failure, file truncation,
        ## invalid header, etc) can be detected and are reported in the info.
        #
        # - alert: PGBackRestErrorDuringBackup
        #   expr: ccp_backrest_last_info_backup_error > 0
        #   for: 60s
        #   labels:
        #     service: postgresql
        #     severity: critical
        #     severity_num: 300
        #   annotations:
        #     summary: 'Error encountered during pgBackRest backup operation. See logs for more information.'
        #
        ## If the pgbackrest command fails to run, the metric disappears from the exporter output and the alert never fires.
        ## An absence alert must be configured explicitly for each target (job) that backups are being monitored.
        ## Checking for absence of just the full backup type should be sufficient (no need for diff/incr).
        ## Note that while the backrest check command failing will likely also cause a scrape error alert, the addition of this
        ## check gives a clearer answer as to what is causing it and that something is wrong with the backups.
        #
        # - alert: PGBackrestAbsentFull_Prod
        #   expr: absent(ccp_backrest_last_full_backup_time_since_completion_seconds{job="Prod"})
        #   for: 10s
        #   labels:
        #     service: postgresql
        #     severity: critical
        #     severity_num: 300
        #   annotations:
        #     description: 'Backup Full status missing for Prod. Check that pgbackrest info command is working on target system.'
0 comments on commit 9e6667b

Please sign in to comment.