-
Notifications
You must be signed in to change notification settings - Fork 211
/
verifying_algorithm.go
226 lines (203 loc) · 8.94 KB
/
verifying_algorithm.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
package tortoise
import (
"context"
"errors"
"fmt"
"sync"
"time"
"github.com/spacemeshos/go-spacemesh/common/types"
"github.com/spacemeshos/go-spacemesh/database"
"github.com/spacemeshos/go-spacemesh/log"
"github.com/spacemeshos/go-spacemesh/mesh"
)
// ThreadSafeVerifyingTortoise is a thread safe verifying tortoise wrapper, it just locks all actions.
type ThreadSafeVerifyingTortoise struct {
	trtl      *turtle       // the wrapped (non-thread-safe) tortoise instance; swapped out after a successful rerun
	logger    log.Log       // logger for the wrapper itself (the inner turtle carries its own)
	lastRerun time.Time     // when the last full rerun from genesis was triggered; compared against RerunInterval
	mutex     sync.RWMutex  // guards trtl and lastRerun; all exported methods lock here
}
// Config holds the arguments and dependencies to create a verifying tortoise instance.
type Config struct {
	LayerSize       int               // expected number of blocks per layer
	Database        database.Database // backing store for persisted tortoise state
	MeshDatabase    blockDataProvider // provides block data and contextual validity
	ATXDB           atxDataProvider   // provides activation transaction data
	Clock           layerClock        // source of current-layer time
	Hdist           uint32            // hare lookback distance: the distance over which we use the input vector/hare results
	Zdist           uint32            // hare result wait distance: the distance over which we're willing to wait for hare results
	ConfidenceParam uint32            // confidence wait distance: how long we wait for global consensus to be established
	GlobalThreshold uint8             // threshold required to finalize blocks and layers (0-100)
	LocalThreshold  uint8             // threshold that determines whether a node votes based on local or global opinion (0-100)
	WindowSize      uint32            // tortoise sliding window: how many layers we store data for
	Log             log.Log           // base logger; tagged per-instance with a tortoise_rerun field
	RerunInterval   time.Duration     // how often to rerun from genesis
}
// NewVerifyingTortoise creates ThreadSafeVerifyingTortoise instance.
// It panics on an invalid config (hdist < zdist, or a threshold outside [0, 100]),
// then tries to recover persisted turtle state; when no state is found it
// initializes a fresh instance from the genesis layer instead.
func NewVerifyingTortoise(ctx context.Context, cfg Config) *ThreadSafeVerifyingTortoise {
	// zdist must not exceed hdist: we cannot wait for hare results beyond the hare lookback window.
	if cfg.Hdist < cfg.Zdist {
		cfg.Log.With().Panic("hdist must be >= zdist", log.Uint32("hdist", cfg.Hdist), log.Uint32("zdist", cfg.Zdist))
	}
	// thresholds are percentages; anything above 100 is a misconfiguration.
	if cfg.GlobalThreshold > 100 || cfg.LocalThreshold > 100 {
		cfg.Log.With().Panic("global and local threshold values must be in the interval [0, 100]")
	}
	alg := &ThreadSafeVerifyingTortoise{
		// the main instance is tagged tortoise_rerun=false; reruns get their own tagged logger
		trtl: newTurtle(
			cfg.Log.WithFields(log.String("tortoise_rerun", "false")),
			cfg.Database,
			cfg.MeshDatabase,
			cfg.ATXDB,
			cfg.Clock,
			cfg.Hdist,
			cfg.Zdist,
			cfg.ConfidenceParam,
			cfg.WindowSize,
			cfg.LayerSize,
			cfg.GlobalThreshold,
			cfg.LocalThreshold,
			cfg.RerunInterval,
		),
		logger:    cfg.Log,
		lastRerun: time.Now(),
	}
	if err := alg.trtl.Recover(); err != nil {
		// ErrNotFound simply means no persisted state exists yet: start from genesis.
		// Any other recovery error is fatal.
		if errors.Is(err, database.ErrNotFound) {
			alg.trtl.init(ctx, mesh.GenesisLayer())
		} else {
			cfg.Log.With().Panic("can't recover turtle state", log.Err(err))
		}
	}
	return alg
}
// LatestComplete returns the latest verified layer.
func (trtl *ThreadSafeVerifyingTortoise) LatestComplete() types.LayerID {
	trtl.mutex.RLock()
	defer trtl.mutex.RUnlock()
	return trtl.trtl.Verified
}
// BaseBlock chooses a base block and creates a differences list. needs the hare results for latest layers.
func (trtl *ThreadSafeVerifyingTortoise) BaseBlock(ctx context.Context) (types.BlockID, [][]types.BlockID, error) {
	trtl.mutex.Lock()
	defer trtl.mutex.Unlock()
	base, exceptions, err := trtl.trtl.BaseBlock(ctx)
	if err != nil {
		return types.BlockID{}, nil, err
	}
	return base, exceptions, nil
}
// HandleLateBlocks processes votes and goodness for late blocks (for late block definition see white paper).
// Returns the old verified layer and new verified layer after taking into account the blocks' votes.
func (trtl *ThreadSafeVerifyingTortoise) HandleLateBlocks(ctx context.Context, blocks []*types.Block) (types.LayerID, types.LayerID) {
	trtl.mutex.Lock()
	defer trtl.mutex.Unlock()
	// snapshot the verified layer before and after processing so the caller can see the delta
	before := trtl.trtl.Verified
	err := trtl.trtl.ProcessNewBlocks(ctx, blocks)
	if err != nil {
		// consider panicking here instead, since it means tortoise is stuck
		trtl.logger.WithContext(ctx).With().Error("tortoise errored handling late blocks", log.Err(err))
	}
	return before, trtl.trtl.Verified
}
// HandleIncomingLayer processes all layer block votes.
// It returns the old verified layer and the new verified layer after taking the
// blocks' votes into account, plus a flag indicating whether a periodic full
// rerun detected a state reversion (in which case oldVerified is rolled back to
// the layer before the first changed layer so state gets reapplied far enough back).
func (trtl *ThreadSafeVerifyingTortoise) HandleIncomingLayer(ctx context.Context, layerID types.LayerID) (oldVerified, newVerified types.LayerID, reverted bool) {
	trtl.mutex.Lock()
	defer trtl.mutex.Unlock()
	oldVerified = trtl.trtl.Verified
	// first check if it's time for a total rerun
	trtl.logger.With().Debug("checking if tortoise needs to rerun from genesis",
		log.Duration("rerun_interval", trtl.trtl.RerunInterval),
		log.Time("last_rerun", trtl.lastRerun))
	// TODO: in future we can do something more sophisticated, using accounting to determine when enough changes to old
	// layers have accumulated (in terms of block weight) that our opinion could actually change. For now, we do the
	// Simplest Possible Thing (TM) and just rerun from genesis once in a while. This requires a different instance of
	// tortoise since we don't want to mess with the state of the main tortoise. We re-stream layer data from genesis
	// using the sliding window, simulating a full resync.
	// See https://github.com/spacemeshos/go-spacemesh/issues/2551
	// time.Since is the idiomatic equivalent of time.Now().Sub (staticcheck S1012)
	if time.Since(trtl.lastRerun) > trtl.trtl.RerunInterval {
		var revertLayer types.LayerID
		if reverted, revertLayer = trtl.rerunFromGenesis(ctx); reverted {
			// make sure state is reapplied from far enough back if there was a state reversion.
			// this is the first changed layer. subtract one to indicate that the layer _prior_ was the old
			// pBase, since we never reapply the state of oldPbase.
			oldVerified = revertLayer.Sub(1)
		}
		trtl.lastRerun = time.Now()
	}
	// Even after a rerun, we still need to process the new incoming layer
	trtl.logger.WithContext(ctx).With().Info("handling incoming layer",
		log.FieldNamed("old_pbase", oldVerified),
		log.FieldNamed("incoming_layer", layerID))
	if err := trtl.trtl.HandleIncomingLayer(ctx, layerID); err != nil {
		// consider panicking here instead, since it means tortoise is stuck
		trtl.logger.WithContext(ctx).With().Error("tortoise errored handling incoming layer", log.Err(err))
	}
	newVerified = trtl.trtl.Verified
	trtl.logger.WithContext(ctx).With().Info("finished handling incoming layer",
		log.FieldNamed("old_pbase", oldVerified),
		log.FieldNamed("new_pbase", newVerified),
		log.FieldNamed("incoming_layer", layerID))
	return
}
// this wrapper monitors the tortoise rerun for database changes that would cause us to need to revert state
type bdpWrapper struct {
	blockDataProvider                    // embedded provider; all methods delegate except SaveContextualValidity
	firstUpdatedLayer *types.LayerID     // earliest layer whose contextual validity changed during the rerun; nil if none
}
// SaveContextualValidity overrides the method in the embedded type to check if we've made changes
func (bdp *bdpWrapper) SaveContextualValidity(bid types.BlockID, lid types.LayerID, validityNew bool) error {
	// once the first changed layer is recorded, later writes just pass straight through
	if bdp.firstUpdatedLayer != nil {
		return bdp.blockDataProvider.SaveContextualValidity(bid, lid, validityNew)
	}
	// compare against the currently stored validity to detect a change
	validityCur, err := bdp.ContextualValidity(bid)
	if err != nil {
		return fmt.Errorf("error reading contextual validity of block %v: %w", bid, err)
	}
	if validityCur != validityNew {
		bdp.firstUpdatedLayer = &lid
	}
	return bdp.blockDataProvider.SaveContextualValidity(bid, lid, validityNew)
}
// trigger a rerun from genesis once in a while.
// Runs a fresh tortoise instance over all layers from the effective genesis up
// to the current last layer, watching (via bdpWrapper) for contextual-validity
// changes. On success the fresh instance replaces the live one. Returns whether
// a state reversion is needed and, if so, the first changed layer.
// Caller must hold trtl.mutex (HandleIncomingLayer does).
func (trtl *ThreadSafeVerifyingTortoise) rerunFromGenesis(ctx context.Context) (reverted bool, revertLayer types.LayerID) {
	// TODO: should this happen "in the background" in a separate goroutine? Should it hold the mutex?
	logger := trtl.logger.WithContext(ctx)
	logger.With().Info("triggering tortoise full rerun from genesis")
	// start from scratch with a new tortoise instance for each rerun
	trtlForRerun := trtl.trtl.cloneTurtleParams()
	// NOTE(review): this assigns trtlForRerun.log here, but trtlForRerun.logger near the
	// bottom of this function — confirm the turtle type really has both fields, or whether
	// one of the two names is a typo.
	trtlForRerun.log = logger.WithFields(log.String("tortoise_rerun", "true"))
	trtlForRerun.init(ctx, mesh.GenesisLayer())
	// wrap the data provider so we notice the first layer whose validity changes
	bdp := bdpWrapper{blockDataProvider: trtlForRerun.bdp}
	trtlForRerun.bdp = &bdp
	// replay every layer in order, simulating a full resync
	for layerID := types.GetEffectiveGenesis(); !layerID.After(trtl.trtl.Last); layerID = layerID.Add(1) {
		logger.With().Debug("rerunning tortoise for layer", layerID)
		if err := trtlForRerun.HandleIncomingLayer(ctx, layerID); err != nil {
			logger.With().Error("tortoise rerun errored", log.Err(err))
			// bail out completely if we encounter an error: don't revert state and don't swap out the trtl
			// TODO: give this some more thought
			return
		}
	}
	// revert state if necessary
	// state will be reapplied in mesh after we return, no need to reapply here
	if bdp.firstUpdatedLayer != nil {
		logger.With().Warning("turtle rerun detected state changes, attempting to reapply state from first changed layer",
			log.FieldNamed("first_layer", bdp.firstUpdatedLayer))
		reverted = true
		revertLayer = *bdp.firstUpdatedLayer
	}
	// swap out the turtle instances so its state is up to date;
	// restore the unwrapped data provider first so the live instance doesn't keep the rerun wrapper
	trtlForRerun.bdp = trtl.trtl.bdp
	trtlForRerun.logger = trtl.logger
	trtl.trtl = trtlForRerun
	return
}
// Persist saves a copy of the current tortoise state to the database
func (trtl *ThreadSafeVerifyingTortoise) Persist(ctx context.Context) error {
	trtl.mutex.Lock()
	defer trtl.mutex.Unlock()
	trtl.logger.WithContext(ctx).Info("persist tortoise")
	return trtl.trtl.persist()
}