satellite/repair: assign lower health to segments with pieces out of placement

Segments with pieces out of placement (which we'll call "POPs" here for brevity) need to be repaired as soon as possible, even if they are otherwise pretty healthy. We don't want to say that _all_ segments with POPs are higher priority than _all_ segments without POPs, but it should take a pretty severe danger to make a segment with fewer POPs be prioritized before a segment with more.

This change accomplishes this requirement along with a wholesale refitting of the segment decay model, in hopes of addressing concerns that the calculation and its output are too opaque and too hard to understand. The new model, its appropriateness, and its application are explained in much more detail than they were in the old code.

As a nice bonus, this model gives more reasonable health values when minPieces is very low.

See https://github.com/storj/datascience/blob/6aa8d85/repair_and_durability/repairPriority/hypergeo.ipynb for more discussion of the mathematics of the model.

Change-Id: I3fd541834a37a253a08ded60ef2d475244653dd5
storj · Feb 6, 2024 · 3075606 · 3075606
1 parent 26ffb74
commit 3075606
Show file tree

Hide file tree

Showing 3 changed files with 160 additions and 67 deletions.
diff --git a/satellite/repair/checker/observer.go b/satellite/repair/checker/observer.go
@@ -411,7 +411,7 @@ func (fork *observerFork) process(ctx context.Context, segment *rangedloop.Segme
 	stats.segmentStats.segmentAge.Observe(int64(segmentAge.Seconds()))
 
 	required, repairThreshold, successThreshold, _ := loadRedundancy(segment.Redundancy, fork.repairOverrides)
-	segmentHealth := repair.SegmentHealth(numHealthy, required, totalNumNodes, fork.nodeFailureRate)
+	segmentHealth := repair.SegmentHealth(numHealthy, required, totalNumNodes, fork.nodeFailureRate, piecesCheck.ForcingRepair.Count())
 	segmentHealthFloatVal.Observe(segmentHealth)
 	stats.segmentStats.segmentHealth.Observe(segmentHealth)
 

diff --git a/satellite/repair/priority.go b/satellite/repair/priority.go
@@ -5,60 +5,120 @@ package repair
 
 import "math"
 
-// SegmentHealth returns a value corresponding to the health of a segment in the
-// repair queue. Lower health segments should be repaired first.
+// segmentHealthNHD purports to find the number of days that a segment can be
+// expected to survive, with the given failureRate.
 //
-// This calculation purports to find the number of iterations for which a
-// segment can be expected to survive, with the given failureRate. The number of
-// iterations for the segment to survive (X) can be modeled with the negative
-// binomial distribution, with the number of pieces that must be lost as the
-// success threshold r, and the chance of losing a single piece in a round as
-// the trial success probability p.
+// The loss of nodes and pieces on the Storj network relative to a single
+// segment can be modeled as a bin holding differently colored balls, one for
+// each node on the network. Nodes holding pieces of our segment become red
+// balls, and all other nodes are blue balls. One by one, we reach in and remove
+// a ball from the bin at random, symbolizing that node going offline. If the
+// ball is blue, we count that as a success (our segment is unaffected). If the
+// ball is red, it is a failure (our segment has lost a piece). We want to know
+// how many draws it will take before some number of failures is reached (that
+// is, how long will it be before a segment loses too many pieces and is no
+// longer reconstructible, if we don't repair it along the way).
 //
-// First, we calculate the expected number of iterations for a segment to
-// survive if we were to lose exactly one node every iteration:
+// With this formulation, the problem is nearly identical to the situation
+// described by the negative hypergeometric distribution
+// (https://en.wikipedia.org/wiki/Negative_hypergeometric_distribution). It is
+// related to the negative binomial distribution
+// (https://en.wikipedia.org/wiki/Negative_binomial_distribution), but the NBD
+// deals with drawing balls from a bin with replacement, while the NHD deals
+// with drawing balls from a bin without replacement. Because we can't expect
+// lost nodes to come back, we don't put balls back into the bin once they are
+// drawn, so ours is a case of drawing without replacement. The negative
+// hypergeometric distribution more closely matches our problem, especially
+// around certain edge cases.
 //
-//	r = numHealthy - minPieces + 1
-//	p = (totalNodes - numHealthy) / totalNodes
-//	X ~ NB(r, p)
+// Do nodes tend to go offline one after another like a sequence of balls being
+// chosen from a bin? No, in reality node failures tend to happen in conjunction
+// with other failures. They are not independent occurrences. However, if we
+// have done a good enough job in declumping segments so that pieces tend to be
+// well distributed across unrelated nodes, then node failure patterns from the
+// point of view of a particular segment should be pretty indistinguishable from
+// random and independent. We have measured in the past what appeared to be a
+// fairly steady rate of node failure: each node has something like a 0.00005435
+// chance of going permanently offline on any given day. With a population size
+// of about 24k nodes, this gives us a mean time between failures (MTBF) for
+// nodes of about 18.5 hours. We can make the simplifying assumption that nodes
+// do in fact go offline at this rate.
 //
-// Then we take the mean of that distribution to use as our expected value,
-// which is pr/(1-p).
+// In our formulation of the model, the NHD parameters are:
 //
-// Finally, to get away from the "one node per iteration" simplification, we
-// just scale the magnitude of the iterations in the model so that there really
-// is one node being lost. For example, if our failureRate and totalNodes imply
-// a churn rate of 3 nodes per day, we just take 1/3 of a day and call that an
-// "iteration" for purposes of the model. To convert iterations in the model to
-// days, we divide the mean of the negative binomial distribution (X, above) by
-// the number of nodes that we estimate will churn in one day.
-func SegmentHealth(numHealthy, minPieces, totalNodes int, failureRate float64) float64 {
-	if totalNodes < minTotalNodes {
-		// this model gives wonky results when there are too few nodes; pretend
-		// there are more nodes than there really are so that the model gives
-		// sane repair priorities
-		totalNodes = minTotalNodes
-	}
-	churnPerRound := float64(totalNodes) * failureRate
-	if churnPerRound < minChurnPerRound {
-		// we artificially limit churnPerRound from going too low in cases
-		// where there are not many nodes, so that health values do not
-		// start to approach the floating point maximum
-		churnPerRound = minChurnPerRound
-	}
-	p := float64(totalNodes-numHealthy) / float64(totalNodes)
-	if p == 1.0 {
-		// floating point precision is insufficient to represent the difference
-		// from p to 1. there are too many nodes for this model, or else
-		// numHealthy is 0 somehow. we can't proceed with the normal calculation
-		// or we will divide by zero.
-		return math.Inf(1)
+//	N (the total number of balls) = totalNodes
+//	K (the number of balls considered successes) = totalNodes-numHealthy
+//	r (the number of failures until we're done) = numHealthy-minPieces+1
+//
+// Knowing this tells us how many draws to expect segment decay to take. The
+// expected value of the negative hypergeometric distribution (the mean number
+// of successes before the r-th failure, over many tries) is r*K/(N-K+1); the
+// expected total number of draws is that value plus the r failures themselves.
+//
+// Now, knowing the number of draws doesn't immediately tell us how many _days_
+// of survival to expect. We use the failureRate parameter to get from _draws_
+// to _days_.
+//
+// We want to scale things so that one draw corresponds to one node failure.
+// All we need for that is the MTBF, the mean time between failures. One draw
+// can correspond to one MTBF interval. Since we know that failureRate is a
+// chance of failure per node per day, we can multiply it by totalNodes to get
+// the total number of node failures per day, and invert that value to get the
+// mean number of days per failure, and that is the MTBF.
+//
+// For more analysis of this model, see the Jupyter Notebook
+// repair_and_durability/repairPriority/hypergeo.ipynb in the storj/datascience
+// repository.
+func segmentHealthNHD(numHealthy, minPieces, totalNodes int, failureRate float64) float64 {
+	if numHealthy < minPieces {
+		// take a shortcut.
+		return 0
 	}
-	mean1 := float64(numHealthy-minPieces+1) * p / (1 - p)
-	return mean1 / churnPerRound
+	N := float64(totalNodes)                 // the total population
+	K := float64(totalNodes - numHealthy)    // the number of successes/blue balls in the bin
+	r := float64(numHealthy - minPieces + 1) // how many failures before the segment is irrecoverable
+
+	// the mean of the distribution, corresponding to the expected number of
+	// successes before we reach r failures
+	expectedNumberOfSuccesses := r * K / (N - K + 1)
+	// the total number of expected draws, including both successes and failures
+	expectedNumberOfDraws := expectedNumberOfSuccesses + r
+
+	drawsPerDay := N * failureRate
+	mtbf := 1 / drawsPerDay
+	days := expectedNumberOfDraws * mtbf
+
+	return days
 }
 
 const (
-	minChurnPerRound = 1e-10
-	minTotalNodes    = 100
+	// These somewhat magic-looking values correspond to pop-significance values
+	// suggested by @elek in the context of a different health model. They have
+	// been adapted to work in this model.
+	popSignificanceLow  = 3154 // from segmentHealthNHD(34, 29, 24000, 0.00005435)
+	popSignificanceHigh = 5385 // from segmentHealthNHD(40, 29, 24000, 0.00005435)
 )
+
+// SegmentHealth returns a value corresponding to the health of a segment in the
+// repair queue. Lower health segments should be repaired first.
+//
+// This implementation uses segmentHealthNHD to calculate the base health value.
+//
+// An additional wrinkle added here is that we need to assign high priority to
+// segments holding pieces which need to be repaired as soon as possible, e.g.,
+// pieces out of placement ("POPs"). We want to tune it so that segments with
+// POPs are generally higher priority than other segments, and segments with
+// more POPs are generally higher priority than segments with fewer POPs. It is
+// possible, however, for a segment with no POPs to be prioritized above a
+// segment that does have POPs, if the first segment is in sufficient danger
+// and the second segment is not.
+func SegmentHealth(numHealthy, minPieces, totalNodes int, failureRate float64, numForcingRepair int) float64 {
+	base := segmentHealthNHD(numHealthy, minPieces, totalNodes, failureRate)
+
+	if numForcingRepair > 0 {
+		// Cap health between popSignificanceLow and popSignificanceHigh days; more POPs (relative to minPieces) means a lower cap.
+		popSignificance := math.Min(float64(numForcingRepair)/float64(minPieces), 1)
+		return math.Min(base, popSignificanceHigh-(popSignificanceHigh-popSignificanceLow)*popSignificance)
+	}
+	return base
+}
diff --git a/satellite/repair/priority_test.go b/satellite/repair/priority_test.go
@@ -15,42 +15,75 @@ import (
 func TestSegmentHealth(t *testing.T) {
 	const failureRate = 0.01
 	assert.Less(t,
-		repair.SegmentHealth(11, 10, 10000, failureRate),
-		repair.SegmentHealth(10, 5, 10000, failureRate))
+		repair.SegmentHealth(11, 10, 10000, failureRate, 0),
+		repair.SegmentHealth(10, 5, 10000, failureRate, 0))
 	assert.Less(t,
-		repair.SegmentHealth(11, 10, 10000, failureRate),
-		repair.SegmentHealth(10, 9, 10000, failureRate))
+		repair.SegmentHealth(11, 10, 10000, failureRate, 0),
+		repair.SegmentHealth(10, 9, 10000, failureRate, 0))
 	assert.Less(t,
-		repair.SegmentHealth(10, 10, 10000, failureRate),
-		repair.SegmentHealth(9, 9, 10000, failureRate))
+		repair.SegmentHealth(10, 10, 10000, failureRate, 0),
+		repair.SegmentHealth(9, 9, 10000, failureRate, 0))
 	assert.Greater(t,
-		repair.SegmentHealth(11, 10, 10000, failureRate),
-		repair.SegmentHealth(12, 11, 10000, failureRate))
+		repair.SegmentHealth(11, 10, 10000, failureRate, 0),
+		repair.SegmentHealth(12, 11, 10000, failureRate, 0))
 	assert.Greater(t,
-		repair.SegmentHealth(13, 10, 10000, failureRate),
-		repair.SegmentHealth(12, 10, 10000, failureRate))
+		repair.SegmentHealth(13, 10, 10000, failureRate, 0),
+		repair.SegmentHealth(12, 10, 10000, failureRate, 0))
 }
 
 func TestSegmentHealthForDecayedSegment(t *testing.T) {
 	const failureRate = 0.01
-	got := repair.SegmentHealth(9, 10, 10000, failureRate)
+	got := repair.SegmentHealth(9, 10, 10000, failureRate, 0)
 	assert.Equal(t, float64(0), got)
 }
 
 func TestHighHealthAndLowFailureRate(t *testing.T) {
 	const failureRate = 0.00005435
 	assert.Less(t,
-		repair.SegmentHealth(36, 35, 10000, failureRate), math.Inf(1))
+		repair.SegmentHealth(36, 35, 10000, failureRate, 0),
+		math.Inf(1))
 	assert.Greater(t,
-		repair.SegmentHealth(36, 35, 10000, failureRate),
-		repair.SegmentHealth(35, 35, 10000, failureRate))
+		repair.SegmentHealth(36, 35, 10000, failureRate, 0),
+		repair.SegmentHealth(35, 35, 10000, failureRate, 0))
 	assert.Less(t,
-		repair.SegmentHealth(60, 29, 10000, failureRate), math.Inf(1))
+		repair.SegmentHealth(60, 29, 10000, failureRate, 0),
+		math.Inf(1))
 	assert.Greater(t,
-		repair.SegmentHealth(61, 29, 10000, failureRate),
-		repair.SegmentHealth(60, 29, 10000, failureRate))
+		repair.SegmentHealth(61, 29, 10000, failureRate, 0),
+		repair.SegmentHealth(60, 29, 10000, failureRate, 0))
 
 	assert.Greater(t,
-		repair.SegmentHealth(11, 10, 10000, failureRate),
-		repair.SegmentHealth(39, 34, 10000, failureRate))
+		repair.SegmentHealth(11, 10, 10000, failureRate, 0),
+		repair.SegmentHealth(39, 34, 10000, failureRate, 0))
+}
+
+func TestPiecesOutOfPlacementCauseHighPriority(t *testing.T) {
+	const failureRate = 0.00005435
+	// POPs existence means lower health
+	assert.Less(t,
+		repair.SegmentHealth(45, 29, 100000, failureRate, 1),
+		repair.SegmentHealth(45, 29, 100000, failureRate, 0))
+	// more POPs mean lower health than fewer POPs
+	assert.Less(t,
+		repair.SegmentHealth(45, 29, 100000, failureRate, 2),
+		repair.SegmentHealth(45, 29, 100000, failureRate, 1))
+	// segments in severe danger have lower health than much more healthy segments with POPs
+	assert.Less(t,
+		repair.SegmentHealth(30, 29, 100000, failureRate, 0),
+		repair.SegmentHealth(50, 29, 100000, failureRate, 1))
+	// a segment with POPs is less healthy than a segment without, even when the segment without has
+	// fewer healthy pieces, as long as the segment without is not in critical danger
+	assert.Less(t,
+		repair.SegmentHealth(56, 29, 100000, failureRate, 1),
+		repair.SegmentHealth(40, 29, 100000, failureRate, 0))
+	// health works as expected when segments have the same (nonzero) number of POPs
+	assert.Less(t,
+		repair.SegmentHealth(11, 10, 100000, failureRate, 1),
+		repair.SegmentHealth(10, 5, 100000, failureRate, 1))
+	assert.Less(t,
+		repair.SegmentHealth(11, 10, 10000, failureRate, 1),
+		repair.SegmentHealth(10, 9, 10000, failureRate, 1))
+	assert.Less(t,
+		repair.SegmentHealth(10, 10, 10000, failureRate, 1),
+		repair.SegmentHealth(9, 9, 10000, failureRate, 1))
 }