package atxsync

import (
	"context"
	"errors"
	"fmt"
	"time"

	"go.uber.org/zap"
	"golang.org/x/sync/errgroup"

	"github.com/spacemeshos/go-spacemesh/common/types"
	"github.com/spacemeshos/go-spacemesh/fetch"
	"github.com/spacemeshos/go-spacemesh/log"
	"github.com/spacemeshos/go-spacemesh/p2p"
	"github.com/spacemeshos/go-spacemesh/p2p/pubsub"
	"github.com/spacemeshos/go-spacemesh/sql"
	"github.com/spacemeshos/go-spacemesh/sql/atxs"
	"github.com/spacemeshos/go-spacemesh/sql/atxsync"
	"github.com/spacemeshos/go-spacemesh/sql/localsql"
	"github.com/spacemeshos/go-spacemesh/system"
)

//go:generate mockgen -typed -package=mocks -destination=./mocks/mocks.go -source=./syncer.go

// fetcher is the subset of the fetch service that the syncer uses to select peers,
// request epoch info, and download atxs.
type fetcher interface {
	SelectBestShuffled(int) []p2p.Peer
	PeerEpochInfo(context.Context, p2p.Peer, types.EpochID) (*fetch.EpochData, error)
	system.AtxFetcher
}

// Opt is a functional option for the Syncer.
type Opt func(*Syncer)

// WithLogger sets the logger used by the Syncer.
func WithLogger(logger *zap.Logger) Opt {
	return func(s *Syncer) {
		s.logger = logger
	}
}

// DefaultConfig returns the default configuration for the atx syncer.
func DefaultConfig() Config {
	return Config{
		EpochInfoInterval: 4 * time.Hour,
		AtxsBatch:         1000,
		RequestsLimit:     20,
		EpochInfoPeers:    2,
		ProgressFraction:  0.1,
		ProgressInterval:  20 * time.Minute,
	}
}

// Config is the configuration of the atx syncer.
type Config struct {
	// EpochInfoInterval is the interval between epoch info requests to the network.
	EpochInfoInterval time.Duration `mapstructure:"epoch-info-request-interval"`
	// EpochInfoPeers is the number of peers that are asked for epoch info on every EpochInfoInterval.
	EpochInfoPeers int `mapstructure:"epoch-info-peers"`
	// RequestsLimit is the maximum number of requests for a single activation.
	//
	// It prevents a peer from advertising an invalid atx and disappearing, which would otherwise
	// make the node keep asking other peers for that atx. The counter is reset to 0 once the atx
	// is advertised again.
	RequestsLimit int `mapstructure:"requests-limit"`
	// AtxsBatch is the maximum number of atxs to sync in a single request.
	AtxsBatch int `mapstructure:"atxs-batch"`
	// ProgressFraction reports progress every time this fraction of the total has been downloaded.
	ProgressFraction float64 `mapstructure:"progress-every-fraction"`
	// ProgressInterval reports progress at least once per interval.
	ProgressInterval time.Duration `mapstructure:"progress-on-time"`
}

// WithConfig overrides the default configuration.
func WithConfig(cfg Config) Opt {
	return func(s *Syncer) {
		s.cfg = cfg
	}
}
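
// A minimal sketch of tuning selected settings via WithConfig. The values below are
// illustrative only, and fetcher, db, and localdb are assumed to come from the node's wiring:
//
//	cfg := DefaultConfig()
//	cfg.AtxsBatch = 500    // smaller batches per atx request
//	cfg.EpochInfoPeers = 3 // ask three peers for epoch info each round
//	syncer := New(fetcher, db, localdb, WithConfig(cfg))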

// New creates a Syncer that uses fetcher to download epoch info and atxs, db to check
// which atxs are already stored, and localdb to persist sync progress.
func New(fetcher fetcher, db sql.Executor, localdb *localsql.Database, opts ...Opt) *Syncer {
	s := &Syncer{
		logger:  zap.NewNop(),
		cfg:     DefaultConfig(),
		fetcher: fetcher,
		db:      db,
		localdb: localdb,
	}
	for _, opt := range opts {
		opt(s)
	}
	return s
}
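
// Illustrative usage sketch (not part of the package API): fetcher, db, localdb, logger,
// ctx, and publish are assumed to be provided by the caller. Download returns only after
// epoch info was fetched successfully after the given time and all advertised atxs were
// downloaded or deemed unavailable:
//
//	syncer := New(fetcher, db, localdb, WithLogger(logger))
//	if err := syncer.Download(ctx, publish, time.Now().Add(time.Hour)); err != nil {
//		// handle error
//	}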

// Syncer downloads the atxs published in a given epoch and persists its progress,
// so that a restarted node can resume or skip an already completed epoch.
type Syncer struct {
	logger  *zap.Logger
	cfg     Config
	fetcher fetcher
	db      sql.Executor
	localdb *localsql.Database
}

// Download downloads epoch info and the atxs referenced by it for the publish epoch.
// It returns once epoch info was downloaded successfully after downloadUntil and every
// known atx is either stored locally or has exhausted its request limit.
func (s *Syncer) Download(parent context.Context, publish types.EpochID, downloadUntil time.Time) error {
	state, err := atxsync.GetSyncState(s.localdb, publish)
	if err != nil {
		return fmt.Errorf("failed to get state for epoch %v: %w", publish, err)
	}
	lastSuccess, total, downloaded, err := atxsync.GetRequest(s.localdb, publish)
	if err != nil && !errors.Is(err, sql.ErrNotFound) {
		return fmt.Errorf("failed to get last request time for epoch %v: %w", publish, err)
	}
	// if immediate is set, epoch info is requested without waiting for EpochInfoInterval
	immediate := len(state) == 0 || (errors.Is(err, sql.ErrNotFound) || !lastSuccess.After(downloadUntil))
	if !immediate && total == downloaded {
		s.logger.Debug("sync for epoch was completed before", log.ZContext(parent), publish.Field().Zap())
		return nil
	}
	s.logger.Info("starting atx sync", log.ZContext(parent), publish.Field().Zap())
	ctx, cancel := context.WithCancel(parent)
	eg, ctx := errgroup.WithContext(ctx)
	updates := make(chan epochUpdate, s.cfg.EpochInfoPeers)
	if len(state) == 0 {
		state = map[types.ATXID]int{}
	} else {
		updates <- epochUpdate{time: lastSuccess, update: state}
	}
	// termination requires two conditions:
	// - epoch info has to be successfully downloaded close to or after the epoch start
	// - all atxs from that epoch have to be downloaded or they are unavailable.
	// an atx is unavailable if it was requested more than RequestsLimit times and no peer provided it.
	eg.Go(func() error {
		return s.downloadEpochInfo(ctx, publish, immediate, updates)
	})
	eg.Go(func() error {
		err := s.downloadAtxs(ctx, publish, downloadUntil, state, updates)
		cancel()
		return err
	})
	if err := eg.Wait(); err != nil {
		return err
	}
	return parent.Err()
}

// downloadEpochInfo periodically asks the best peers for the atx ids published in the epoch
// and forwards every successful response on the updates channel.
func (s *Syncer) downloadEpochInfo(
	ctx context.Context,
	publish types.EpochID,
	immediate bool,
	updates chan<- epochUpdate,
) error {
	interval := s.cfg.EpochInfoInterval
	if immediate {
		interval = 0
	}
	for {
		if interval != 0 {
			s.logger.Debug(
				"waiting between epoch info requests",
				publish.Field().Zap(),
				zap.Duration("duration", interval),
			)
		}
		select {
		case <-ctx.Done():
			return nil
		// TODO(dshulyak) this has to be randomized in a followup,
		// when sync is scheduled in advance, to smooth out the request rate across the network
		case <-time.After(interval):
		}
		peers := s.fetcher.SelectBestShuffled(s.cfg.EpochInfoPeers)
		if len(peers) == 0 {
			return errors.New("no peers available")
		}
		// do not run this concurrently; epoch info is large and will continue to grow
		for _, peer := range peers {
			epochData, err := s.fetcher.PeerEpochInfo(ctx, peer, publish)
			if err != nil || epochData == nil {
				if errors.Is(err, context.Canceled) {
					return nil
				}
				s.logger.Warn("failed to download epoch info",
					log.ZContext(ctx),
					publish.Field().Zap(),
					zap.String("peer", peer.String()),
					zap.Error(err),
				)
				continue
			}
			s.logger.Info("downloaded epoch info",
				log.ZContext(ctx),
				publish.Field().Zap(),
				zap.String("peer", peer.String()),
				zap.Int("atxs", len(epochData.AtxIDs)),
			)
			// adding hashes to the fetcher is not useful: they overflow the cache and are never used,
			// so we switch to asking the best peers immediately
			update := make(map[types.ATXID]int, len(epochData.AtxIDs))
			for _, atx := range epochData.AtxIDs {
				update[atx] = 0
			}
			select {
			case <-ctx.Done():
				return nil
			case updates <- epochUpdate{time: time.Now(), update: update}:
			}
			// after the first success switch to requests at the regular interval
			interval = s.cfg.EpochInfoInterval
		}
	}
}

// downloadAtxs consumes epoch info updates and downloads the referenced atxs in batches,
// skipping atxs that are already stored or have exceeded RequestsLimit, and persists
// progress to the local database after every batch.
func (s *Syncer) downloadAtxs(
	ctx context.Context,
	publish types.EpochID,
	downloadUntil time.Time,
	state map[types.ATXID]int,
	updates <-chan epochUpdate,
) error {
	var (
		batch                = make([]types.ATXID, 0, s.cfg.AtxsBatch)
		downloaded           = map[types.ATXID]bool{}
		previouslyDownloaded = 0
		start                = time.Now()
		lastSuccess          time.Time
		progressTimestamp    = start
		nothingToDownload    = len(state) == 0
	)
	for {
		if nothingToDownload && lastSuccess.After(downloadUntil) {
			s.logger.Info(
				"atx sync completed",
				log.ZContext(ctx),
				publish.Field().Zap(),
				zap.Int("downloaded", len(downloaded)),
				zap.Int("total", len(state)),
				zap.Int("unavailable", len(state)-len(downloaded)),
				zap.Duration("duration", time.Since(start)),
			)
			return nil
		}
		// wait for an update if there is nothing to download
		if nothingToDownload {
			select {
			case <-ctx.Done():
				return nil
			case update := <-updates:
				lastSuccess = update.time
				for atx, count := range update.update {
					state[atx] = count
				}
			}
		} else {
			// otherwise check updates periodically but don't stop downloading
			select {
			case <-ctx.Done():
				return nil
			case update := <-updates:
				lastSuccess = update.time
				for atx, count := range update.update {
					state[atx] = count
				}
			default:
			}
		}
		for atx, requests := range state {
			if downloaded[atx] {
				continue
			}
			exists, err := atxs.Has(s.db, atx)
			if err != nil {
				return err
			}
			if exists {
				downloaded[atx] = true
				continue
			}
			if requests >= s.cfg.RequestsLimit {
				delete(state, atx)
				continue
			}
			batch = append(batch, atx)
			if len(batch) == cap(batch) {
				break
			}
		}
		nothingToDownload = len(batch) == 0
		if progress := float64(len(downloaded) - previouslyDownloaded); progress/float64(
			len(state),
		) > s.cfg.ProgressFraction && s.cfg.ProgressFraction != 0 ||
			time.Since(progressTimestamp) > s.cfg.ProgressInterval && s.cfg.ProgressInterval != 0 {
			s.logger.Info(
				"atx sync progress",
				log.ZContext(ctx),
				publish.Field().Zap(),
				zap.Int("downloaded", len(downloaded)),
				zap.Int("total", len(state)),
				zap.Int("progress", int(progress)),
				zap.Float64("rate per sec", progress/time.Since(progressTimestamp).Seconds()),
			)
			previouslyDownloaded = len(downloaded)
			progressTimestamp = time.Now()
		}
		if len(batch) > 0 {
			if err := s.fetcher.GetAtxs(ctx, batch); err != nil {
				if errors.Is(err, context.Canceled) {
					return nil
				}
				s.logger.Debug("failed to download atxs", log.ZContext(ctx), log.NiceZapError(err))
				batchError := &fetch.BatchError{}
				if errors.As(err, &batchError) {
					for hash, err := range batchError.Errors {
						if _, exists := state[types.ATXID(hash)]; !exists {
							continue
						}
						if errors.Is(err, fetch.ErrExceedMaxRetries) {
							state[types.ATXID(hash)]++
						} else if errors.Is(err, pubsub.ErrValidationReject) {
							state[types.ATXID(hash)] = s.cfg.RequestsLimit
						}
					}
				}
			}
		}
		if err := s.localdb.WithTx(context.Background(), func(tx *sql.Tx) error {
			err := atxsync.SaveRequest(tx, publish, lastSuccess, int64(len(state)), int64(len(downloaded)))
			if err != nil {
				return fmt.Errorf("failed to save request time: %w", err)
			}
			return atxsync.SaveSyncState(tx, publish, state, s.cfg.RequestsLimit)
		}); err != nil {
			return fmt.Errorf("failed to persist state for epoch %v: %w", publish, err)
		}
		batch = batch[:0]
	}
}

// epochUpdate carries atx ids (with their request counters) obtained from an epoch info
// response or from previously persisted state, together with the time of the last
// successful epoch info download.
type epochUpdate struct {
	time   time.Time
	update map[types.ATXID]int
}