Add scrape_timeout_seconds metric (behind feature flag) #9247

Merged
1 commit merged on Sep 2, 2021
8 changes: 6 additions & 2 deletions cmd/prometheus/main.go
@@ -107,6 +107,7 @@ type flagConfig struct {
outageTolerance model.Duration
resendDelay model.Duration
web web.Options
+ scrape scrape.Options
tsdb tsdbOptions
lookbackDelta model.Duration
webTimeout model.Duration
@@ -152,6 +153,9 @@ func (c *flagConfig) setFeatureListOptions(logger log.Logger) error {
case "memory-snapshot-on-shutdown":
c.tsdb.EnableMemorySnapshotOnShutdown = true
level.Info(logger).Log("msg", "Experimental memory snapshot on shutdown enabled")
case "extra-scrape-metrics":
c.scrape.ExtraMetrics = true
level.Info(logger).Log("msg", "Experimental additional scrape metrics")
case "":
continue
default:
@@ -312,7 +316,7 @@ func main() {
a.Flag("query.max-samples", "Maximum number of samples a single query can load into memory. Note that queries will fail if they try to load more samples than this into memory, so this also limits the number of samples a query can return.").
Default("50000000").IntVar(&cfg.queryMaxSamples)

a.Flag("enable-feature", "Comma separated feature names to enable. Valid options: exemplar-storage, expand-external-labels, memory-snapshot-on-shutdown, promql-at-modifier, promql-negative-offset, remote-write-receiver. See https://prometheus.io/docs/prometheus/latest/feature_flags/ for more details.").
a.Flag("enable-feature", "Comma separated feature names to enable. Valid options: exemplar-storage, expand-external-labels, memory-snapshot-on-shutdown, promql-at-modifier, promql-negative-offset, remote-write-receiver, extra-scrape-metrics. See https://prometheus.io/docs/prometheus/latest/feature_flags/ for more details.").
Default("").StringsVar(&cfg.featureList)

promlogflag.AddFlags(a, &cfg.promlogConfig)
@@ -457,7 +461,7 @@ func main() {
ctxNotify, cancelNotify = context.WithCancel(context.Background())
discoveryManagerNotify = discovery.NewManager(ctxNotify, log.With(logger, "component", "discovery manager notify"), discovery.Name("notify"))

- scrapeManager = scrape.NewManager(log.With(logger, "component", "scrape manager"), fanoutStorage)
+ scrapeManager = scrape.NewManager(&cfg.scrape, log.With(logger, "component", "scrape manager"), fanoutStorage)

opts = promql.EngineOpts{
Logger: log.With(logger, "component", "query engine"),
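The wiring above is small but complete: the feature-flag parser flips a boolean on a `scrape.Options` struct, and that struct is later handed to the scrape manager. Below is a minimal, self-contained sketch of the same parsing pattern; `flagConfig`, `Options`, and the log messages here are simplified stand-ins for the real Prometheus types, not part of the patch.

```go
package main

import (
	"fmt"
	"log"
	"strings"
)

// Options stands in for scrape.Options: a plain struct whose fields are
// toggled by feature flags before any component is constructed.
type Options struct {
	ExtraMetrics bool
}

type flagConfig struct {
	featureList []string
	scrape      Options
}

// setFeatureListOptions walks the comma separated feature names and flips
// the matching option, rejecting names it does not recognise.
func (c *flagConfig) setFeatureListOptions() error {
	for _, f := range c.featureList {
		for _, o := range strings.Split(f, ",") {
			switch o {
			case "extra-scrape-metrics":
				c.scrape.ExtraMetrics = true
				log.Println("experimental additional scrape metrics enabled")
			case "":
				continue
			default:
				return fmt.Errorf("unknown feature flag %q", o)
			}
		}
	}
	return nil
}

func main() {
	cfg := flagConfig{featureList: []string{"extra-scrape-metrics"}}
	if err := cfg.setFeatureListOptions(); err != nil {
		log.Fatal(err)
	}
	fmt.Println("ExtraMetrics:", cfg.scrape.ExtraMetrics) // ExtraMetrics: true
}
```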
8 changes: 8 additions & 0 deletions docs/feature_flags.md
@@ -61,3 +61,11 @@ Exemplar storage is implemented as a fixed size circular buffer that stores exem
This takes the snapshot of the chunks that are in memory along with the series information when shutting down and stores
it on disk. This will reduce the startup time since the memory state can be restored with this snapshot and m-mapped
chunks without the need of WAL replay.

+ ## Extra Scrape Metrics
+
+ `--enable-feature=extra-scrape-metrics`
+
+ When enabled, for each instance scrape, Prometheus stores a sample in the following additional time series:
+
+ * `scrape_timeout_seconds`. The configured `scrape_timeout` for a target. This allows you to measure each target to find out how close they are to timing out with `scrape_duration_seconds / scrape_timeout_seconds`.
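As a concrete use of the ratio mentioned in the new docs text, an alerting expression such as `scrape_duration_seconds / scrape_timeout_seconds > 0.8` (the 0.8 threshold is purely illustrative) would flag targets whose scrapes regularly approach their configured timeout.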
14 changes: 12 additions & 2 deletions scrape/manager.go
@@ -99,12 +99,16 @@ func (mc *MetadataMetricsCollector) Collect(ch chan<- prometheus.Metric) {
}

// NewManager is the Manager constructor
- func NewManager(logger log.Logger, app storage.Appendable) *Manager {
+ func NewManager(o *Options, logger log.Logger, app storage.Appendable) *Manager {
+ if o == nil {
+ o = &Options{}
+ }
if logger == nil {
logger = log.NewNopLogger()
}
m := &Manager{
append: app,
+ opts: o,
logger: logger,
scrapeConfigs: make(map[string]*config.ScrapeConfig),
scrapePools: make(map[string]*scrapePool),
@@ -116,9 +120,15 @@ func NewManager(logger log.Logger, app storage.Appendable) *Manager {
return m
}

+ // Options are the configuration parameters to the scrape manager.
+ type Options struct {
+ ExtraMetrics bool
+ }
+
// Manager maintains a set of scrape pools and manages start/stop cycles
// when receiving new target groups from the discovery manager.
type Manager struct {
+ opts *Options
logger log.Logger
append storage.Appendable
graceShut chan struct{}
@@ -181,7 +191,7 @@ func (m *Manager) reload() {
level.Error(m.logger).Log("msg", "error reloading target set", "err", "invalid config id:"+setName)
continue
}
sp, err := newScrapePool(scrapeConfig, m.append, m.jitterSeed, log.With(m.logger, "scrape_pool", setName))
sp, err := newScrapePool(scrapeConfig, m.append, m.jitterSeed, log.With(m.logger, "scrape_pool", setName), m.opts.ExtraMetrics)
if err != nil {
level.Error(m.logger).Log("msg", "error creating new scrape pool", "err", err, "scrape_pool", setName)
continue
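`NewManager` now follows the common Go options-struct pattern: take a pointer to an `Options` value and fall back to zero-value defaults when the caller passes nil, so existing call sites keep working with a one-line change. A self-contained sketch of that pattern, using toy stand-ins rather than the real `scrape.Manager`, might look like this:

```go
package main

import (
	"fmt"
	"log"
)

// Options carries optional knobs for the manager; its zero value is a
// valid "all defaults" configuration.
type Options struct {
	ExtraMetrics bool
}

// Manager is a stripped-down stand-in for scrape.Manager.
type Manager struct {
	opts   *Options
	logger *log.Logger
}

// NewManager tolerates nil for both arguments so callers that do not care
// about options or logging stay simple.
func NewManager(o *Options, logger *log.Logger) *Manager {
	if o == nil {
		o = &Options{}
	}
	if logger == nil {
		logger = log.Default()
	}
	return &Manager{opts: o, logger: logger}
}

func main() {
	m1 := NewManager(nil, nil)                              // all defaults
	m2 := NewManager(&Options{ExtraMetrics: true}, nil)     // opt in to the extra series
	fmt.Println(m1.opts.ExtraMetrics, m2.opts.ExtraMetrics) // false true
}
```

Because the zero value of `Options` is usable, passing `&Options{}` (as the updated tests below do) behaves like the old two-argument constructor: `ExtraMetrics` stays false unless the feature flag set it.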
9 changes: 6 additions & 3 deletions scrape/manager_test.go
@@ -398,7 +398,8 @@ scrape_configs:
ch = make(chan struct{}, 1)
)

- scrapeManager := NewManager(nil, nil)
+ opts := Options{}
+ scrapeManager := NewManager(&opts, nil, nil)
newLoop := func(scrapeLoopOptions) loop {
ch <- struct{}{}
return noopLoop()
@@ -460,7 +461,8 @@ scrape_configs:
}

func TestManagerTargetsUpdates(t *testing.T) {
- m := NewManager(nil, nil)
+ opts := Options{}
+ m := NewManager(&opts, nil, nil)

ts := make(chan map[string][]*targetgroup.Group)
go m.Run(ts)
@@ -512,7 +514,8 @@ global:
return cfg
}

- scrapeManager := NewManager(nil, nil)
+ opts := Options{}
+ scrapeManager := NewManager(&opts, nil, nil)

// Load the first config.
cfg1 := getConfig("ha1")
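For code outside this repository that constructs a scrape manager directly, the migration mirrors these test updates: build a `scrape.Options` value and pass its address as the new first argument of `NewManager`. A zero-value `Options` preserves the previous behaviour, since `ExtraMetrics` defaults to false.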
22 changes: 19 additions & 3 deletions scrape/scrape.go
@@ -263,7 +263,7 @@ const maxAheadTime = 10 * time.Minute

type labelsMutator func(labels.Labels) labels.Labels

- func newScrapePool(cfg *config.ScrapeConfig, app storage.Appendable, jitterSeed uint64, logger log.Logger) (*scrapePool, error) {
+ func newScrapePool(cfg *config.ScrapeConfig, app storage.Appendable, jitterSeed uint64, logger log.Logger, reportScrapeTimeout bool) (*scrapePool, error) {
targetScrapePools.Inc()
if logger == nil {
logger = log.NewNopLogger()
@@ -311,6 +311,7 @@ func newScrapePool(cfg *config.ScrapeConfig, app storage.Appendable, jitterSeed
opts.labelLimits,
opts.interval,
opts.timeout,
+ reportScrapeTimeout,
)
}

@@ -828,6 +829,8 @@ type scrapeLoop struct {
stopped chan struct{}

disabledEndOfRunStalenessMarkers bool

+ reportScrapeTimeout bool
}

// scrapeCache tracks mappings of exposed metric strings to label sets and
@@ -1087,6 +1090,7 @@ func newScrapeLoop(ctx context.Context,
labelLimits *labelLimits,
interval time.Duration,
timeout time.Duration,
+ reportScrapeTimeout bool,
) *scrapeLoop {
if l == nil {
l = log.NewNopLogger()
@@ -1112,6 +1116,7 @@
labelLimits: labelLimits,
interval: interval,
timeout: timeout,
+ reportScrapeTimeout: reportScrapeTimeout,
}
sl.ctx, sl.cancel = context.WithCancel(ctx)

@@ -1216,7 +1221,7 @@ func (sl *scrapeLoop) scrapeAndReport(interval, timeout time.Duration, last, app
}()

defer func() {
- if err = sl.report(app, appendTime, time.Since(start), total, added, seriesAdded, scrapeErr); err != nil {
+ if err = sl.report(app, appendTime, timeout, time.Since(start), total, added, seriesAdded, scrapeErr); err != nil {
level.Warn(sl.l).Log("msg", "Appending scrape report failed", "err", err)
}
}()
@@ -1604,9 +1609,10 @@ const (
scrapeSamplesMetricName = "scrape_samples_scraped" + "\xff"
samplesPostRelabelMetricName = "scrape_samples_post_metric_relabeling" + "\xff"
scrapeSeriesAddedMetricName = "scrape_series_added" + "\xff"
+ scrapeTimeoutMetricName = "scrape_timeout_seconds" + "\xff"
)

- func (sl *scrapeLoop) report(app storage.Appender, start time.Time, duration time.Duration, scraped, added, seriesAdded int, scrapeErr error) (err error) {
+ func (sl *scrapeLoop) report(app storage.Appender, start time.Time, timeout, duration time.Duration, scraped, added, seriesAdded int, scrapeErr error) (err error) {
sl.scraper.Report(start, duration, scrapeErr)

ts := timestamp.FromTime(start)
@@ -1631,6 +1637,11 @@ func (sl *scrapeLoop) report(app storage.Appender, start time.Time, duration tim
if err = sl.addReportSample(app, scrapeSeriesAddedMetricName, ts, float64(seriesAdded)); err != nil {
return
}
+ if sl.reportScrapeTimeout {
+ if err = sl.addReportSample(app, scrapeTimeoutMetricName, ts, timeout.Seconds()); err != nil {
+ return
+ }
+ }
return
}

@@ -1654,6 +1665,11 @@ func (sl *scrapeLoop) reportStale(app storage.Appender, start time.Time) (err er
if err = sl.addReportSample(app, scrapeSeriesAddedMetricName, ts, stale); err != nil {
return
}
+ if sl.reportScrapeTimeout {
+ if err = sl.addReportSample(app, scrapeTimeoutMetricName, ts, stale); err != nil {
+ return
+ }
+ }
return
}

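The scrape.go changes carry the configured timeout down into `report`, which appends it as one more synthetic sample per scrape, and into `reportStale`, which writes the same series when a target's staleness markers are emitted. The sketch below mirrors that flow under simplifying assumptions: a toy in-memory appender instead of Prometheus' `storage.Appender`, and a plain `math.NaN()` standing in for the real staleness marker value.

```go
package main

import (
	"fmt"
	"math"
	"time"
)

// sample is a toy stand-in for one appended (series name, timestamp, value) triple.
type sample struct {
	name  string
	ts    int64
	value float64
}

// appender just collects report samples; the real code appends them through
// a storage.Appender.
type appender struct{ samples []sample }

func (a *appender) addReportSample(name string, ts int64, v float64) error {
	a.samples = append(a.samples, sample{name, ts, v})
	return nil
}

// report mirrors the shape of scrapeLoop.report: scrape_duration_seconds is
// always written, scrape_timeout_seconds only when reportScrapeTimeout is set.
func report(app *appender, start time.Time, timeout, duration time.Duration, reportScrapeTimeout bool) error {
	ts := start.UnixMilli()
	if err := app.addReportSample("scrape_duration_seconds", ts, duration.Seconds()); err != nil {
		return err
	}
	if reportScrapeTimeout {
		if err := app.addReportSample("scrape_timeout_seconds", ts, timeout.Seconds()); err != nil {
			return err
		}
	}
	return nil
}

// reportStale writes NaN for the same series, standing in for the staleness
// markers emitted when a target goes away.
func reportStale(app *appender, start time.Time, reportScrapeTimeout bool) error {
	ts := start.UnixMilli()
	stale := math.NaN()
	if err := app.addReportSample("scrape_duration_seconds", ts, stale); err != nil {
		return err
	}
	if reportScrapeTimeout {
		if err := app.addReportSample("scrape_timeout_seconds", ts, stale); err != nil {
			return err
		}
	}
	return nil
}

func main() {
	app := &appender{}
	_ = report(app, time.Now(), 10*time.Second, 1200*time.Millisecond, true)
	_ = reportStale(app, time.Now(), true)
	for _, s := range app.samples {
		fmt.Printf("%s ts=%d value=%g\n", s.name, s.ts, s.value)
	}
}
```

Gating the extra sample on `reportScrapeTimeout` means users who have not enabled `--enable-feature=extra-scrape-metrics` pay no additional storage cost per target.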