From 06c55d7d82e5a264e51572e350a483fe9bd3ea01 Mon Sep 17 00:00:00 2001 From: Brian Brazil Date: Sat, 10 Oct 2015 17:27:33 +0100 Subject: [PATCH] promql: Remove extrapolation from rate/increase/delta. This is an improvement from both a theoretical and practical standpoint. Theory: For simplicity I'll take the example of increase(). Counters are fundamentally lossy, if there's a counter reset or instance failure between one scrape and the next we'll lose information about the period up to the reset/failure. Thus the samples we have allow us to calculate a lower-bound on the increase() in a time period. Extrapolation multiplies this by an amount depending on timing which is an upper bound based on what might have happened if everything continued as it was. This mix of upper and lower bounds means that we can't make any meaningful statements about the output of increase() in relation to what actually happened. By removing the extrapolation, we can once again reason that the result of increase() is a lower bound on the true increase of the counter. Practical: Fixes #581. The extrapolation presumes that it's looking at a range within a continuous stream of samples. If in fact the time series starts or ends within this range, this leads to an over-correction. For discrete counters and gauges, extrapolating to invalid values in their domain can be confusing and prevent rules being written that depend on exact values. For those looking to graph things more accurately irate() is a better choice than extrapolation on rate(). For those looking to calculate how a gauge is trending deriv() is a better choice than delta(). 
--- promql/functions.go | 78 ++++++++++++++-------------------- promql/testdata/functions.test | 9 +++- promql/testdata/legacy.test | 8 ++-- 3 files changed, 43 insertions(+), 52 deletions(-) diff --git a/promql/functions.go b/promql/functions.go index 8d2a2830775..4e47c412f4f 100644 --- a/promql/functions.go +++ b/promql/functions.go @@ -46,55 +46,15 @@ func funcTime(ev *evaluator, args Expressions) model.Value { // === delta(matrix model.ValMatrix) Vector === func funcDelta(ev *evaluator, args Expressions) model.Value { - // This function still takes a 2nd argument for use by rate() and increase(). - isCounter := len(args) >= 2 && ev.evalInt(args[1]) > 0 resultVector := vector{} - - // If we treat these metrics as counters, we need to fetch all values - // in the interval to find breaks in the timeseries' monotonicity. - // I.e. if a counter resets, we want to ignore that reset. - var matrixValue matrix - if isCounter { - matrixValue = ev.evalMatrix(args[0]) - } else { - matrixValue = ev.evalMatrixBounds(args[0]) - } - for _, samples := range matrixValue { + for _, samples := range ev.evalMatrixBounds(args[0]) { // No sense in trying to compute a delta without at least two points. Drop // this vector element. if len(samples.Values) < 2 { continue } - var ( - counterCorrection model.SampleValue - lastValue model.SampleValue - ) - for _, sample := range samples.Values { - currentValue := sample.Value - if isCounter && currentValue < lastValue { - counterCorrection += lastValue - currentValue - } - lastValue = currentValue - } - resultValue := lastValue - samples.Values[0].Value + counterCorrection - - targetInterval := args[0].(*MatrixSelector).Range - sampledInterval := samples.Values[len(samples.Values)-1].Timestamp.Sub(samples.Values[0].Timestamp) - if sampledInterval == 0 { - // Only found one sample. Cannot compute a rate from this. - continue - } - // Correct for differences in target vs. actual delta interval. 
- // - // Above, we didn't actually calculate the delta for the specified target - // interval, but for an interval between the first and last found samples - // under the target interval, which will usually have less time between - // them. Depending on how many samples are found under a target interval, - // the delta results are distorted and temporal aliasing occurs (ugly - // bumps). This effect is corrected for below. - intervalCorrection := model.SampleValue(targetInterval) / model.SampleValue(sampledInterval) - resultValue *= intervalCorrection + resultValue := samples.Values[len(samples.Values)-1].Value - samples.Values[0].Value resultSample := &sample{ Metric: samples.Metric, @@ -109,8 +69,7 @@ func funcDelta(ev *evaluator, args Expressions) model.Value { // === rate(node model.ValMatrix) Vector === func funcRate(ev *evaluator, args Expressions) model.Value { - args = append(args, &NumberLiteral{1}) - vector := funcDelta(ev, args).(vector) + vector := funcIncrease(ev, args).(vector) // TODO: could be other type of model.ValMatrix in the future (right now, only // MatrixSelector exists). Find a better way of getting the duration of a @@ -124,8 +83,35 @@ func funcRate(ev *evaluator, args Expressions) model.Value { // === increase(node model.ValMatrix) Vector === func funcIncrease(ev *evaluator, args Expressions) model.Value { - args = append(args, &NumberLiteral{1}) - return funcDelta(ev, args).(vector) + resultVector := vector{} + for _, samples := range ev.evalMatrix(args[0]) { + // No sense in trying to compute an increase without at least two points. Drop + // this vector element. 
+ if len(samples.Values) < 2 { + continue + } + + var ( + counterCorrection model.SampleValue + lastValue model.SampleValue + ) + for _, sample := range samples.Values { + currentValue := sample.Value + if currentValue < lastValue { + counterCorrection += lastValue - currentValue + } + lastValue = currentValue + } + resultValue := lastValue - samples.Values[0].Value + counterCorrection + resultSample := &sample{ + Metric: samples.Metric, + Value: resultValue, + Timestamp: ev.Timestamp, + } + resultSample.Metric.Del(model.MetricNameLabel) + resultVector = append(resultVector, resultSample) + } + return resultVector } // === irate(node model.ValMatrix) Vector === diff --git a/promql/testdata/functions.test b/promql/testdata/functions.test index 3972053dfe8..d313ca3a00f 100644 --- a/promql/testdata/functions.test +++ b/promql/testdata/functions.test @@ -63,6 +63,11 @@ eval instant at 50m increase(http_requests[50m]) {path="/foo"} 100 {path="/bar"} 90 +# Tests for increase(). +eval instant at 51m increase(http_requests[50m]) + {path="/foo"} 90 + {path="/bar"} 80 + clear # Tests for irate(). @@ -87,10 +92,10 @@ load 5m http_requests{job="app-server", instance="1", group="canary"} 0+80x10 # deriv should return the same as rate in simple cases. -eval instant at 50m rate(http_requests{group="canary", instance="1", job="app-server"}[60m]) +eval instant at 50m rate(http_requests{group="canary", instance="1", job="app-server"}[50m]) {group="canary", instance="1", job="app-server"} 0.26666666666666666 -eval instant at 50m deriv(http_requests{group="canary", instance="1", job="app-server"}[60m]) +eval instant at 50m deriv(http_requests{group="canary", instance="1", job="app-server"}[50m]) {group="canary", instance="1", job="app-server"} 0.26666666666666666 # deriv should return correct result. 
diff --git a/promql/testdata/legacy.test b/promql/testdata/legacy.test index 01ca435c683..b7a428d7e0c 100644 --- a/promql/testdata/legacy.test +++ b/promql/testdata/legacy.test @@ -194,18 +194,18 @@ eval instant at 50m sum(http_requests) by (job) + min(http_requests) by (job) + {job="api-server"} 1750 -# Deltas should be adjusted for target interval vs. samples under target interval. +# Deltas should not be adjusted for target interval vs. samples under target interval. eval instant at 50m delta(http_requests{group="canary", instance="1", job="app-server"}[18m]) - {group="canary", instance="1", job="app-server"} 288 + {group="canary", instance="1", job="app-server"} 240 # Rates should calculate per-second rates. eval instant at 50m rate(http_requests{group="canary", instance="1", job="app-server"}[60m]) - {group="canary", instance="1", job="app-server"} 0.26666666666666666 + {group="canary", instance="1", job="app-server"} 0.2222222222222222 # Counter resets at in the middle of range are handled correctly by rate(). eval instant at 50m rate(testcounter_reset_middle[60m]) - {} 0.03 + {} 0.025 # Counter resets at end of range are ignored by rate().