2 changes: 1 addition & 1 deletion README.md
@@ -166,7 +166,7 @@ flag. This removes all built-in metrics, and uses only metrics defined by querie

### Automatically discover databases
To scrape metrics from all databases on a database server, the database DSNs can be dynamically discovered via the
`--auto-discover-databases` flag. When true, `SELECT datname FROM pg_database WHERE datallowconn = true AND datistemplate = false` is run for all configured DSNs. From the
`--auto-discover-databases` flag. When true, `SELECT datname FROM pg_database WHERE datallowconn = true AND datistemplate = false AND datname != current_database()` is run for all configured DSNs. From the
result a new set of DSNs is created, for which the metrics are scraped.

In addition, the option `--exclude-databases` makes it possible to filter the auto-discovery result and discard databases you do not need.
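For example, a typical invocation combining both flags might look like the following (a minimal sketch; the connection string and the excluded database name are illustrative):

```
export DATA_SOURCE_NAME="postgresql://exporter:secret@db-host:5432/postgres?sslmode=disable"
./postgres_exporter --auto-discover-databases --exclude-databases="unwanted_db"
```

Metrics are then scraped from every discovered database except `unwanted_db`.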
39 changes: 31 additions & 8 deletions cmd/postgres_exporter/postgres_exporter.go
@@ -100,6 +100,7 @@ type Mapping map[string]MappingOptions
type UserQuery struct {
Query string `yaml:"query"`
Metrics []Mapping `yaml:"metrics"`
Master bool `yaml:"master"` // Run this query only against the master database
CacheSeconds uint64 `yaml:"cache_seconds"` // Number of seconds to cache the namespace result metrics for.
}
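For reference, the new `Master` field surfaces as a `master:` key in a user query definition, as in the `pg_replication` entry of the updated queries.yaml later in this diff:

```yaml
pg_replication:
  query: "SELECT EXTRACT(EPOCH FROM (now() - pg_last_xact_replay_timestamp())) as lag"
  master: true  # run this query only against the master database
  metrics:
    - lag:
        usage: "GAUGE"
        description: "Replication lag behind master in seconds"
```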

@@ -139,13 +140,15 @@ func (cm *ColumnMapping) UnmarshalYAML(unmarshal func(interface{}) error) error
// This is mainly so we can pass cacheSeconds around.
type intermediateMetricMap struct {
columnMappings map[string]ColumnMapping
master bool
cacheSeconds uint64
}

// MetricMapNamespace groups metric maps under a shared set of labels.
type MetricMapNamespace struct {
labels []string // Label names for this namespace
columnMappings map[string]MetricMap // Column mappings in this namespace
master bool // Run the query only against the master database
cacheSeconds uint64 // Number of seconds this metric namespace can be cached. 0 disables.
}

@@ -211,6 +214,7 @@ var builtinMetricMaps = map[string]intermediateMetricMap{
"blk_write_time": {COUNTER, "Time spent writing data file blocks by backends in this database, in milliseconds", nil, nil},
"stats_reset": {COUNTER, "Time at which these statistics were last reset", nil, nil},
},
true,
0,
},
"pg_stat_database_conflicts": {
@@ -223,6 +227,7 @@ var builtinMetricMaps = map[string]intermediateMetricMap{
"confl_bufferpin": {COUNTER, "Number of queries in this database that have been canceled due to pinned buffers", nil, nil},
"confl_deadlock": {COUNTER, "Number of queries in this database that have been canceled due to deadlocks", nil, nil},
},
true,
0,
},
"pg_locks": {
@@ -231,6 +236,7 @@ var builtinMetricMaps = map[string]intermediateMetricMap{
"mode": {LABEL, "Type of Lock", nil, nil},
"count": {GAUGE, "Number of locks", nil, nil},
},
true,
0,
},
"pg_stat_replication": {
@@ -276,6 +282,7 @@ var builtinMetricMaps = map[string]intermediateMetricMap{
"flush_lag": {DISCARD, "Time elapsed between flushing recent WAL locally and receiving notification that this standby server has written and flushed it (but not yet applied it). This can be used to gauge the delay that synchronous_commit level remote_flush incurred while committing if this server was configured as a synchronous standby.", nil, semver.MustParseRange(">=10.0.0")},
"replay_lag": {DISCARD, "Time elapsed between flushing recent WAL locally and receiving notification that this standby server has written, flushed and applied it. This can be used to gauge the delay that synchronous_commit level remote_apply incurred while committing if this server was configured as a synchronous standby.", nil, semver.MustParseRange(">=10.0.0")},
},
true,
0,
},
"pg_stat_activity": {
@@ -285,6 +292,7 @@ var builtinMetricMaps = map[string]intermediateMetricMap{
"count": {GAUGE, "number of connections in this state", nil, nil},
"max_tx_duration": {GAUGE, "max duration in seconds any active transaction has been running", nil, nil},
},
true,
0,
},
}
@@ -444,6 +452,7 @@ func parseUserQueries(content []byte) (map[string]intermediateMetricMap, map[str
newMetricMap := make(map[string]ColumnMapping)
metricMap = intermediateMetricMap{
columnMappings: newMetricMap,
master: specs.Master,
cacheSeconds: specs.CacheSeconds,
}
metricMaps[metric] = metricMap
@@ -614,7 +623,7 @@ func makeDescMap(pgVersion semver.Version, serverLabels prometheus.Labels, metri
}
}

metricMap[namespace] = MetricMapNamespace{variableLabels, thisMap, intermediateMappings.cacheSeconds}
metricMap[namespace] = MetricMapNamespace{variableLabels, thisMap, intermediateMappings.master, intermediateMappings.cacheSeconds}
}

return metricMap
@@ -857,7 +866,7 @@ func (s *Server) Scrape(ch chan<- prometheus.Metric, disableSettingsMetrics bool

var err error

if (!disableSettingsMetrics && !*autoDiscoverDatabases) || (!disableSettingsMetrics && *autoDiscoverDatabases && s.master) {
if !disableSettingsMetrics && s.master {
if err = querySettings(ch, s); err != nil {
err = fmt.Errorf("error retrieving settings: %s", err)
}
@@ -1257,6 +1266,12 @@ func queryNamespaceMappings(ch chan<- prometheus.Metric, server *Server) map[str

for namespace, mapping := range server.metricMap {
log.Debugln("Querying namespace: ", namespace)

if mapping.master && !server.master {
log.Debugln("Query skipped...")
continue
}

scrapeMetric := false
// Check if the metric is cached
server.cacheMtx.Lock()
@@ -1335,12 +1350,13 @@ func (e *Exporter) checkMapVersions(ch chan<- prometheus.Metric, server *Server)
log.Infof("Semantic Version Changed on %q: %s -> %s", server, server.lastMapVersion, semanticVersion)
server.mappingMtx.Lock()

if e.disableDefaultMetrics || (!e.disableDefaultMetrics && e.autoDiscoverDatabases && !server.master) {
server.metricMap = make(map[string]MetricMapNamespace)
server.queryOverrides = make(map[string]string)
} else {
// Get default metrics only for the master database
if !e.disableDefaultMetrics && server.master {
server.metricMap = makeDescMap(semanticVersion, server.labels, e.builtinMetricMaps)
server.queryOverrides = makeQueryOverrideMap(semanticVersion, queryOverrides)
} else {
server.metricMap = make(map[string]MetricMapNamespace)
server.queryOverrides = make(map[string]string)
}

server.lastMapVersion = semanticVersion
@@ -1370,11 +1386,11 @@ func (e *Exporter) checkMapVersions(ch chan<- prometheus.Metric, server *Server)
server.mappingMtx.Unlock()
}

// Output the version as a special metric
// Output the version as a special metric, only for the master database
versionDesc := prometheus.NewDesc(fmt.Sprintf("%s_%s", namespace, staticLabelName),
"Version string as reported by postgres", []string{"version", "short_version"}, server.labels)

if !e.disableDefaultMetrics && (server.master && e.autoDiscoverDatabases) {
if !e.disableDefaultMetrics && server.master {
ch <- prometheus.MustNewConstMetric(versionDesc,
prometheus.UntypedValue, 1, versionString, semanticVersion.String())
}
@@ -1439,6 +1455,7 @@ func (e *Exporter) discoverDatabaseDSNs() []string {
continue
}

// With auto-discovery enabled, treat the first DSN as the master database
server.master = true

databaseNames, err := queryDatabases(server)
@@ -1467,10 +1484,16 @@

func (e *Exporter) scrapeDSN(ch chan<- prometheus.Metric, dsn string) error {
server, err := e.servers.GetServer(dsn)

if err != nil {
return &ErrorConnectToServer{fmt.Sprintf("Error opening connection to database (%s): %s", loggableDSN(dsn), err.Error())}
}

// Without auto-discovery, treat every configured DSN as a master database
if !e.autoDiscoverDatabases {
server.master = true
}

// Check if map versions need to be updated
if err := e.checkMapVersions(ch, server); err != nil {
log.Warnln("Proceeding with outdated query maps, as the Postgres version could not be determined:", err)
@@ -117,6 +117,7 @@ func (s *IntegrationSuite) TestUnknownMetricParsingDoesntCrash(c *C) {
for k := range exporter.builtinMetricMaps {
emptyMaps[k] = intermediateMetricMap{
map[string]ColumnMapping{},
true,
0,
}
}
1 change: 1 addition & 0 deletions cmd/postgres_exporter/postgres_exporter_test.go
@@ -32,6 +32,7 @@ func (s *FunctionalSuite) TestSemanticVersionColumnDiscard(c *C) {
"metric_which_stays": {COUNTER, "This metric should not be eliminated", nil, nil},
"metric_which_discards": {COUNTER, "This metric should be forced to DISCARD", nil, nil},
},
true,
0,
},
}
143 changes: 79 additions & 64 deletions queries.yaml
@@ -1,20 +1,25 @@
pg_replication:
query: "SELECT EXTRACT(EPOCH FROM (now() - pg_last_xact_replay_timestamp())) as lag"
master: true
metrics:
- lag:
usage: "GAUGE"
description: "Replication lag behind master in seconds"

pg_postmaster:
query: "SELECT pg_postmaster_start_time as start_time_seconds from pg_postmaster_start_time()"
master: true
metrics:
- start_time_seconds:
usage: "GAUGE"
description: "Time at which postmaster started"

pg_stat_user_tables:
query: "SELECT schemaname, relname, seq_scan, seq_tup_read, idx_scan, idx_tup_fetch, n_tup_ins, n_tup_upd, n_tup_del, n_tup_hot_upd, n_live_tup, n_dead_tup, n_mod_since_analyze, COALESCE(last_vacuum, '1970-01-01Z'), COALESCE(last_vacuum, '1970-01-01Z') as last_vacuum, COALESCE(last_autovacuum, '1970-01-01Z') as last_autovacuum, COALESCE(last_analyze, '1970-01-01Z') as last_analyze, COALESCE(last_autoanalyze, '1970-01-01Z') as last_autoanalyze, vacuum_count, autovacuum_count, analyze_count, autoanalyze_count FROM pg_stat_user_tables"
query: "SELECT current_database() datname, schemaname, relname, seq_scan, seq_tup_read, idx_scan, idx_tup_fetch, n_tup_ins, n_tup_upd, n_tup_del, n_tup_hot_upd, n_live_tup, n_dead_tup, n_mod_since_analyze, COALESCE(last_vacuum, '1970-01-01Z'), COALESCE(last_vacuum, '1970-01-01Z') as last_vacuum, COALESCE(last_autovacuum, '1970-01-01Z') as last_autovacuum, COALESCE(last_analyze, '1970-01-01Z') as last_analyze, COALESCE(last_autoanalyze, '1970-01-01Z') as last_autoanalyze, vacuum_count, autovacuum_count, analyze_count, autoanalyze_count FROM pg_stat_user_tables"
metrics:
- datname:
usage: "LABEL"
description: "Name of current database"
- schemaname:
usage: "LABEL"
description: "Name of the schema that this table is in"
@@ -80,8 +85,11 @@ pg_stat_user_tables:
description: "Number of times this table has been analyzed by the autovacuum daemon"

pg_statio_user_tables:
query: "SELECT schemaname, relname, heap_blks_read, heap_blks_hit, idx_blks_read, idx_blks_hit, toast_blks_read, toast_blks_hit, tidx_blks_read, tidx_blks_hit FROM pg_statio_user_tables"
query: "SELECT current_database() datname, schemaname, relname, heap_blks_read, heap_blks_hit, idx_blks_read, idx_blks_hit, toast_blks_read, toast_blks_hit, tidx_blks_read, tidx_blks_hit FROM pg_statio_user_tables"
metrics:
- datname:
usage: "LABEL"
description: "Name of current database"
- schemaname:
usage: "LABEL"
description: "Name of the schema that this table is in"
@@ -115,6 +123,7 @@

pg_database:
query: "SELECT pg_database.datname, pg_database_size(pg_database.datname) as size FROM pg_database"
master: true
cache_seconds: 30
metrics:
- datname:
@@ -124,67 +133,73 @@ pg_database:
usage: "GAUGE"
description: "Disk space used by the database"


pg_stat_statements:
query: "SELECT query, calls, total_time / 1000 as total_time_seconds, min_time / 1000 as min_time_seconds, max_time / 1000 as max_time_seconds, mean_time / 1000 as mean_time_seconds, stddev_time / 1000 as stddev_time_seconds, rows, shared_blks_hit, shared_blks_read, shared_blks_dirtied, shared_blks_written, local_blks_hit, local_blks_read, local_blks_dirtied, local_blks_written, temp_blks_read, temp_blks_written, blk_read_time / 1000 as blk_read_time_seconds, blk_write_time / 1000 as blk_write_time_seconds FROM pg_stat_statements"
query: "SELECT t2.rolname, t3.datname, queryid, calls, total_time / 1000 as total_time_seconds, min_time / 1000 as min_time_seconds, max_time / 1000 as max_time_seconds, mean_time / 1000 as mean_time_seconds, stddev_time / 1000 as stddev_time_seconds, rows, shared_blks_hit, shared_blks_read, shared_blks_dirtied, shared_blks_written, local_blks_hit, local_blks_read, local_blks_dirtied, local_blks_written, temp_blks_read, temp_blks_written, blk_read_time / 1000 as blk_read_time_seconds, blk_write_time / 1000 as blk_write_time_seconds FROM pg_stat_statements t1 join pg_roles t2 on (t1.userid=t2.oid) join pg_database t3 on (t1.dbid=t3.oid)"
master: true
metrics:
- query:
usage: "LABEL"
description: "Query class"
- calls:
usage: "COUNTER"
description: "Number of times executed"
- total_time_seconds:
usage: "COUNTER"
description: "Total time spent in the statement, in milliseconds"
- min_time_seconds:
usage: "GAUGE"
description: "Minimum time spent in the statement, in milliseconds"
- max_time_seconds:
usage: "GAUGE"
description: "Maximum time spent in the statement, in milliseconds"
- mean_time_seconds:
usage: "GAUGE"
description: "Mean time spent in the statement, in milliseconds"
- stddev_time_seconds:
usage: "GAUGE"
description: "Population standard deviation of time spent in the statement, in milliseconds"
- rows:
usage: "COUNTER"
description: "Total number of rows retrieved or affected by the statement"
- shared_blks_hit:
usage: "COUNTER"
description: "Total number of shared block cache hits by the statement"
- shared_blks_read:
usage: "COUNTER"
description: "Total number of shared blocks read by the statement"
- shared_blks_dirtied:
usage: "COUNTER"
description: "Total number of shared blocks dirtied by the statement"
- shared_blks_written:
usage: "COUNTER"
description: "Total number of shared blocks written by the statement"
- local_blks_hit:
usage: "COUNTER"
description: "Total number of local block cache hits by the statement"
- local_blks_read:
usage: "COUNTER"
description: "Total number of local blocks read by the statement"
- local_blks_dirtied:
usage: "COUNTER"
description: "Total number of local blocks dirtied by the statement"
- local_blks_written:
usage: "COUNTER"
description: "Total number of local blocks written by the statement"
- temp_blks_read:
usage: "COUNTER"
description: "Total number of temp blocks read by the statement"
- temp_blks_written:
usage: "COUNTER"
description: "Total number of temp blocks written by the statement"
- blk_read_time_seconds:
usage: "COUNTER"
description: "Total time the statement spent reading blocks, in milliseconds (if track_io_timing is enabled, otherwise zero)"
- blk_write_time_seconds:
usage: "COUNTER"
description: "Total time the statement spent writing blocks, in milliseconds (if track_io_timing is enabled, otherwise zero)"
- rolname:
usage: "LABEL"
description: "Name of user"
- datname:
usage: "LABEL"
description: "Name of database"
- queryid:
usage: "LABEL"
description: "Query ID"
- calls:
usage: "COUNTER"
description: "Number of times executed"
- total_time_seconds:
usage: "COUNTER"
description: "Total time spent in the statement, in milliseconds"
- min_time_seconds:
usage: "GAUGE"
description: "Minimum time spent in the statement, in milliseconds"
- max_time_seconds:
usage: "GAUGE"
description: "Maximum time spent in the statement, in milliseconds"
- mean_time_seconds:
usage: "GAUGE"
description: "Mean time spent in the statement, in milliseconds"
- stddev_time_seconds:
usage: "GAUGE"
description: "Population standard deviation of time spent in the statement, in milliseconds"
- rows:
usage: "COUNTER"
description: "Total number of rows retrieved or affected by the statement"
- shared_blks_hit:
usage: "COUNTER"
description: "Total number of shared block cache hits by the statement"
- shared_blks_read:
usage: "COUNTER"
description: "Total number of shared blocks read by the statement"
- shared_blks_dirtied:
usage: "COUNTER"
description: "Total number of shared blocks dirtied by the statement"
- shared_blks_written:
usage: "COUNTER"
description: "Total number of shared blocks written by the statement"
- local_blks_hit:
usage: "COUNTER"
description: "Total number of local block cache hits by the statement"
- local_blks_read:
usage: "COUNTER"
description: "Total number of local blocks read by the statement"
- local_blks_dirtied:
usage: "COUNTER"
description: "Total number of local blocks dirtied by the statement"
- local_blks_written:
usage: "COUNTER"
description: "Total number of local blocks written by the statement"
- temp_blks_read:
usage: "COUNTER"
description: "Total number of temp blocks read by the statement"
- temp_blks_written:
usage: "COUNTER"
description: "Total number of temp blocks written by the statement"
- blk_read_time_seconds:
usage: "COUNTER"
description: "Total time the statement spent reading blocks, in milliseconds (if track_io_timing is enabled, otherwise zero)"
- blk_write_time_seconds:
usage: "COUNTER"
description: "Total time the statement spent writing blocks, in milliseconds (if track_io_timing is enabled, otherwise zero)"