Skip to content

Commit

Permalink
promql: Remove extrapolation from rate/increase/delta.
Browse files Browse the repository at this point in the history
This is an improvement from both a theoretical and practical
standpoint.

Theory:
For simplicity I'll take the example of increase().

Counters are fundamentally lossy; if there's a counter reset
or instance failure between one scrape and the next we'll
lose information about the period up to the reset/failure.
Thus the samples we have allow us to calculate a lower-bound
on the increase() in a time period.

Extrapolation multiplies this by an amount depending on timing which is
an upper bound based on what might have happened if everything continued
as it was.
This mix of upper and lower bounds means that we can't make any
meaningful statements about the output of increase() in relation to what
actually happened.

By removing the extrapolation, we can once again reason that the result
of increase() is a lower bound on the true increase of the counter.

Practical:
Fixes #581.
The extrapolation presumes that it's looking at a range within a
continuous stream of samples. If in fact the time series starts or
ends within this range, this leads to an over-correction.

For discrete counters and gauges, extrapolating to invalid values in
their domain can be confusing and prevent rules being written that
depend on exact values.

For those looking to graph things more accurately, irate() is a better
choice than extrapolation on rate().
For those looking to calculate how a gauge is trending, deriv() is a
better choice than delta().
  • Loading branch information
brian-brazil committed Oct 11, 2015
1 parent bbdfb10 commit 06c55d7
Show file tree
Hide file tree
Showing 3 changed files with 43 additions and 52 deletions.
78 changes: 32 additions & 46 deletions promql/functions.go
Original file line number Diff line number Diff line change
Expand Up @@ -46,55 +46,15 @@ func funcTime(ev *evaluator, args Expressions) model.Value {

// === delta(matrix model.ValMatrix) Vector ===
func funcDelta(ev *evaluator, args Expressions) model.Value {
// This function still takes a 2nd argument for use by rate() and increase().
isCounter := len(args) >= 2 && ev.evalInt(args[1]) > 0
resultVector := vector{}

// If we treat these metrics as counters, we need to fetch all values
// in the interval to find breaks in the timeseries' monotonicity.
// I.e. if a counter resets, we want to ignore that reset.
var matrixValue matrix
if isCounter {
matrixValue = ev.evalMatrix(args[0])
} else {
matrixValue = ev.evalMatrixBounds(args[0])
}
for _, samples := range matrixValue {
for _, samples := range ev.evalMatrixBounds(args[0]) {
// No sense in trying to compute a delta without at least two points. Drop
// this vector element.
if len(samples.Values) < 2 {
continue
}

var (
counterCorrection model.SampleValue
lastValue model.SampleValue
)
for _, sample := range samples.Values {
currentValue := sample.Value
if isCounter && currentValue < lastValue {
counterCorrection += lastValue - currentValue
}
lastValue = currentValue
}
resultValue := lastValue - samples.Values[0].Value + counterCorrection

targetInterval := args[0].(*MatrixSelector).Range
sampledInterval := samples.Values[len(samples.Values)-1].Timestamp.Sub(samples.Values[0].Timestamp)
if sampledInterval == 0 {
// Only found one sample. Cannot compute a rate from this.
continue
}
// Correct for differences in target vs. actual delta interval.
//
// Above, we didn't actually calculate the delta for the specified target
// interval, but for an interval between the first and last found samples
// under the target interval, which will usually have less time between
// them. Depending on how many samples are found under a target interval,
// the delta results are distorted and temporal aliasing occurs (ugly
// bumps). This effect is corrected for below.
intervalCorrection := model.SampleValue(targetInterval) / model.SampleValue(sampledInterval)
resultValue *= intervalCorrection
resultValue := samples.Values[len(samples.Values)-1].Value - samples.Values[0].Value

resultSample := &sample{
Metric: samples.Metric,
Expand All @@ -109,8 +69,7 @@ func funcDelta(ev *evaluator, args Expressions) model.Value {

// === rate(node model.ValMatrix) Vector ===
func funcRate(ev *evaluator, args Expressions) model.Value {
args = append(args, &NumberLiteral{1})
vector := funcDelta(ev, args).(vector)
vector := funcIncrease(ev, args).(vector)

// TODO: could be other type of model.ValMatrix in the future (right now, only
// MatrixSelector exists). Find a better way of getting the duration of a
Expand All @@ -124,8 +83,35 @@ func funcRate(ev *evaluator, args Expressions) model.Value {

// === increase(node model.ValMatrix) Vector ===
func funcIncrease(ev *evaluator, args Expressions) model.Value {
args = append(args, &NumberLiteral{1})
return funcDelta(ev, args).(vector)
resultVector := vector{}
for _, samples := range ev.evalMatrix(args[0]) {
// No sense in trying to compute an increase without at least two points. Drop
// this vector element.
if len(samples.Values) < 2 {
continue
}

var (
counterCorrection model.SampleValue
lastValue model.SampleValue
)
for _, sample := range samples.Values {
currentValue := sample.Value
if currentValue < lastValue {
counterCorrection += lastValue - currentValue
}
lastValue = currentValue
}
resultValue := lastValue - samples.Values[0].Value + counterCorrection
resultSample := &sample{
Metric: samples.Metric,
Value: resultValue,
Timestamp: ev.Timestamp,
}
resultSample.Metric.Del(model.MetricNameLabel)
resultVector = append(resultVector, resultSample)
}
return resultVector
}

// === irate(node model.ValMatrix) Vector ===
Expand Down
9 changes: 7 additions & 2 deletions promql/testdata/functions.test
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,11 @@ eval instant at 50m increase(http_requests[50m])
{path="/foo"} 100
{path="/bar"} 90

# Tests for increase().
eval instant at 51m increase(http_requests[50m])
{path="/foo"} 90
{path="/bar"} 80

clear

# Tests for irate().
Expand All @@ -87,10 +92,10 @@ load 5m
http_requests{job="app-server", instance="1", group="canary"} 0+80x10

# deriv should return the same as rate in simple cases.
eval instant at 50m rate(http_requests{group="canary", instance="1", job="app-server"}[60m])
eval instant at 50m rate(http_requests{group="canary", instance="1", job="app-server"}[50m])
{group="canary", instance="1", job="app-server"} 0.26666666666666666

eval instant at 50m deriv(http_requests{group="canary", instance="1", job="app-server"}[60m])
eval instant at 50m deriv(http_requests{group="canary", instance="1", job="app-server"}[50m])
{group="canary", instance="1", job="app-server"} 0.26666666666666666

# deriv should return correct result.
Expand Down
8 changes: 4 additions & 4 deletions promql/testdata/legacy.test
Original file line number Diff line number Diff line change
Expand Up @@ -194,18 +194,18 @@ eval instant at 50m sum(http_requests) by (job) + min(http_requests) by (job) +
{job="api-server"} 1750


# Deltas should be adjusted for target interval vs. samples under target interval.
# Deltas should not be adjusted for target interval vs. samples under target interval.
eval instant at 50m delta(http_requests{group="canary", instance="1", job="app-server"}[18m])
{group="canary", instance="1", job="app-server"} 288
{group="canary", instance="1", job="app-server"} 240


# Rates should calculate per-second rates.
eval instant at 50m rate(http_requests{group="canary", instance="1", job="app-server"}[60m])
{group="canary", instance="1", job="app-server"} 0.26666666666666666
{group="canary", instance="1", job="app-server"} 0.2222222222222222

# Counter resets in the middle of the range are handled correctly by rate().
eval instant at 50m rate(testcounter_reset_middle[60m])
{} 0.03
{} 0.025


# Counter resets at end of range are ignored by rate().
Expand Down

0 comments on commit 06c55d7

Please sign in to comment.