package backfill

import (
	"context"
	"math"
	"time"

	"github.com/libp2p/go-libp2p/core/peer"
	"github.com/pkg/errors"
	"github.com/prysmaticlabs/prysm/v5/beacon-chain/db/filesystem"
	"github.com/prysmaticlabs/prysm/v5/beacon-chain/p2p"
	"github.com/prysmaticlabs/prysm/v5/beacon-chain/p2p/peers"
	"github.com/prysmaticlabs/prysm/v5/beacon-chain/startup"
	"github.com/prysmaticlabs/prysm/v5/beacon-chain/sync"
	"github.com/prysmaticlabs/prysm/v5/beacon-chain/verification"
	"github.com/prysmaticlabs/prysm/v5/consensus-types/primitives"
	log "github.com/sirupsen/logrus"
)
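
// batchWorkerPool abstracts the pool of workers that process backfill batches: spawn starts the
// pool, todo enqueues a batch for processing, and complete blocks until a finished batch (or a
// terminal error) is available.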
type batchWorkerPool interface {
	spawn(ctx context.Context, n int, clock *startup.Clock, a PeerAssigner, v *verifier, cm sync.ContextByteVersions, blobVerifier verification.NewBlobVerifier, bfs *filesystem.BlobStorage)
	todo(b batch)
	complete() (batch, error)
}
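
// worker is the batch-processing loop run by each member of the pool.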
type worker interface {
	run(context.Context)
}
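
// newWorker constructs a worker wired to the given batch input/output channels and verification dependencies.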
type newWorker func(id workerId, in, out chan batch, c *startup.Clock, v *verifier, cm sync.ContextByteVersions, nbv verification.NewBlobVerifier, bfs *filesystem.BlobStorage) worker
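
// defaultNewWorker returns a newWorker that binds the given p2p host into each p2pWorker it creates.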
func defaultNewWorker(p p2p.P2P) newWorker {
	return func(id workerId, in, out chan batch, c *startup.Clock, v *verifier, cm sync.ContextByteVersions, nbv verification.NewBlobVerifier, bfs *filesystem.BlobStorage) worker {
		return newP2pWorker(id, p, in, out, c, v, cm, nbv, bfs)
	}
}
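
// p2pBatchWorkerPool routes batches between the pool's owner and its workers, assigning each
// batch to a suitable, non-busy peer before handing it off.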
type p2pBatchWorkerPool struct {
	maxBatches  int
	newWorker   newWorker
	toWorkers   chan batch
	fromWorkers chan batch
	toRouter    chan batch
	fromRouter  chan batch
	shutdownErr chan error
	endSeq      []batch
	ctx         context.Context
	cancel      func()
}

var _ batchWorkerPool = &p2pBatchWorkerPool{}
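
// newP2PBatchWorkerPool initializes a p2pBatchWorkerPool. The router channels are buffered to
// maxBatches so that every in-flight batch can be queued without blocking.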
func newP2PBatchWorkerPool(p p2p.P2P, maxBatches int) *p2pBatchWorkerPool {
	nw := defaultNewWorker(p)
	return &p2pBatchWorkerPool{
		newWorker:   nw,
		toRouter:    make(chan batch, maxBatches),
		fromRouter:  make(chan batch, maxBatches),
		toWorkers:   make(chan batch),
		fromWorkers: make(chan batch),
		maxBatches:  maxBatches,
		shutdownErr: make(chan error),
	}
}
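
// spawn derives a cancellable context for the pool, starts the batch router goroutine, and launches n workers.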
func (p *p2pBatchWorkerPool) spawn(ctx context.Context, n int, c *startup.Clock, a PeerAssigner, v *verifier, cm sync.ContextByteVersions, nbv verification.NewBlobVerifier, bfs *filesystem.BlobStorage) {
	p.ctx, p.cancel = context.WithCancel(ctx)
	go p.batchRouter(a)
	for i := 0; i < n; i++ {
		go p.newWorker(workerId(i), p.toWorkers, p.fromWorkers, c, v, cm, nbv, bfs).run(p.ctx)
	}
}
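
// todo enqueues a batch to be processed by the pool.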
func (p *p2pBatchWorkerPool) todo(b batch) {
	// Intercept batchEndSequence batches so that workers never see them; workers would not know what
	// to do with a batchEndSequence batch. It is a signal to the pool that the batcher has stopped
	// producing work and the pool is close to winding down. See complete() for how the pool handles
	// the state where all workers are idle and every incoming batch signals the end of the sequence.
	if b.state == batchEndSequence {
		p.endSeq = append(p.endSeq, b)
		return
	}
	p.toRouter <- b
}
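
// complete returns the next batch finished by a worker. It returns errEndSequence once all
// maxBatches slots are occupied by end-of-sequence markers, and an error if the pool shuts down
// or its context is canceled.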
func (p *p2pBatchWorkerPool) complete() (batch, error) {
	if len(p.endSeq) == p.maxBatches {
		return p.endSeq[0], errEndSequence
	}
	select {
	case b := <-p.fromRouter:
		return b, nil
	case err := <-p.shutdownErr:
		return batch{}, errors.Wrap(err, "fatal error from backfill worker pool")
	case <-p.ctx.Done():
		log.WithError(p.ctx.Err()).Info("p2pBatchWorkerPool context canceled, shutting down")
		return batch{}, p.ctx.Err()
	}
}
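
// batchRouter is the pool's scheduling loop. It collects batches from todo() and from workers,
// keeps them sorted in descending order, and assigns as many as possible to non-busy peers,
// retrying on a one-second ticker when no suitable peers are available.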
func (p *p2pBatchWorkerPool) batchRouter(pa PeerAssigner) {
	busy := make(map[peer.ID]bool)
	todo := make([]batch, 0)
	rt := time.NewTicker(time.Second)
	earliest := primitives.Slot(math.MaxUint64)
	for {
		select {
		case b := <-p.toRouter:
			todo = append(todo, b)
			// Sort batches in descending order so that dependent batches are always processed first.
			sortBatchDesc(todo)
		case <-rt.C:
			// Worker assignments can fail if the PeerAssigner can't find a suitable peer.
			// This ticker exists to periodically break out of the channel select
			// and retry failed assignments.
		case b := <-p.fromWorkers:
			pid := b.busy
			busy[pid] = false
			if b.state == batchBlobSync {
				todo = append(todo, b)
				sortBatchDesc(todo)
			} else {
				p.fromRouter <- b
			}
		case <-p.ctx.Done():
			log.WithError(p.ctx.Err()).Info("p2pBatchWorkerPool context canceled, shutting down")
			p.shutdown(p.ctx.Err())
			return
		}
		if len(todo) == 0 {
			continue
		}
		// Try to assign as many outstanding batches as possible to peers and feed the assigned batches to workers.
		assigned, err := pa.Assign(busy, len(todo))
		if err != nil {
			if errors.Is(err, peers.ErrInsufficientSuitable) {
				// Transient error caused by an insufficient number of connected peers. Leave the batches
				// in the queue and get to them once the peer situation is resolved.
				continue
			}
			p.shutdown(err)
			return
		}
		for _, pid := range assigned {
			busy[pid] = true
			todo[0].busy = pid
			p.toWorkers <- todo[0].withPeer(pid)
			if todo[0].begin < earliest {
				earliest = todo[0].begin
				oldestBatch.Set(float64(earliest))
			}
			todo = todo[1:]
		}
	}
}
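
// shutdown cancels the pool's context and surfaces err to the complete() caller via the shutdownErr channel.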
func (p *p2pBatchWorkerPool) shutdown(err error) {
	p.cancel()
	p.shutdownErr <- err
}