Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
private,satellite: add chore to dq stray nodes
Full scope: private/testplanet, satellite/{overlay,satellitedb}. Description: In most cases, downtime tracking with audits will eventually lead to disqualification (DQ) of nodes that are unresponsive. However, if a stray node has no pieces, it will not be audited and will thus never be disqualified. This chore checks for nodes that have not been successfully contacted within a set time and disqualifies them. New flags toggle DQ of stray nodes and configure how often the chore runs and how long a node can go without contact. Change-Id: Ic9d41fdbf214736798925e728245180fb3c55615
- Loading branch information
Showing
9 changed files
with
201 additions
and
7 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,61 @@ | ||
// Copyright (C) 2020 Storj Labs, Inc. | ||
// See LICENSE for copying information. | ||
|
||
package straynodes | ||
|
||
import ( | ||
"context" | ||
"time" | ||
|
||
"github.com/spacemonkeygo/monkit/v3" | ||
"go.uber.org/zap" | ||
|
||
"storj.io/common/sync2" | ||
"storj.io/storj/satellite/overlay" | ||
) | ||
|
||
var mon = monkit.Package() | ||
|
||
// Config holds the configurable settings of the stray nodes chore.
type Config struct {
	// EnableDQ toggles whether nodes that have not been contacted in some
	// time are disqualified.
	EnableDQ bool `help:"whether nodes will be disqualified if they have not been contacted in some time" releaseDefault:"false" devDefault:"true"`

	// Interval is how often the chore checks for and disqualifies stray nodes.
	Interval time.Duration `help:"how often to check for and DQ stray nodes" releaseDefault:"168h" devDefault:"5m"`

	// MaxDurationWithoutContact is how long a node may go without contacting
	// the satellite before it is disqualified.
	MaxDurationWithoutContact time.Duration `help:"length of time a node can go without contacting satellite before being disqualified" releaseDefault:"720h" devDefault:"5m"`
}
|
||
// Chore disqualifies stray nodes. | ||
type Chore struct { | ||
log *zap.Logger | ||
cache overlay.DB | ||
maxDurationWithoutContact time.Duration | ||
Loop *sync2.Cycle | ||
} | ||
|
||
// NewChore creates a new stray nodes Chore. | ||
func NewChore(log *zap.Logger, cache overlay.DB, config Config) *Chore { | ||
return &Chore{ | ||
log: log, | ||
cache: cache, | ||
maxDurationWithoutContact: config.MaxDurationWithoutContact, | ||
Loop: sync2.NewCycle(config.Interval), | ||
} | ||
} | ||
|
||
// Run runs the chore. | ||
func (chore *Chore) Run(ctx context.Context) (err error) { | ||
defer mon.Task()(&ctx)(&err) | ||
|
||
return chore.Loop.Run(ctx, func(ctx context.Context) error { | ||
err := chore.cache.DQNodesLastSeenBefore(ctx, time.Now().UTC().Add(-chore.maxDurationWithoutContact)) | ||
if err != nil { | ||
chore.log.Error("error disqualifying stray nodes", zap.Error(err)) | ||
} | ||
return nil | ||
}) | ||
} | ||
|
||
// Close closes chore. | ||
func (chore *Chore) Close() error { | ||
chore.Loop.Close() | ||
return nil | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,68 @@ | ||
// Copyright (C) 2020 Storj Labs, Inc. | ||
// See LICENSE for copying information. | ||
|
||
package straynodes_test | ||
|
||
import ( | ||
"testing" | ||
"time" | ||
|
||
"github.com/stretchr/testify/require" | ||
"go.uber.org/zap" | ||
|
||
"storj.io/common/pb" | ||
"storj.io/common/testcontext" | ||
"storj.io/storj/private/testplanet" | ||
"storj.io/storj/satellite" | ||
"storj.io/storj/satellite/overlay" | ||
) | ||
|
||
func TestDQStrayNodes(t *testing.T) { | ||
testplanet.Run(t, testplanet.Config{ | ||
SatelliteCount: 1, StorageNodeCount: 2, | ||
Reconfigure: testplanet.Reconfigure{ | ||
Satellite: func(log *zap.Logger, index int, config *satellite.Config) { | ||
config.StrayNodes.MaxDurationWithoutContact = 24 * time.Hour | ||
}, | ||
}, | ||
}, func(t *testing.T, ctx *testcontext.Context, planet *testplanet.Planet) { | ||
strayNode := planet.StorageNodes[0] | ||
liveNode := planet.StorageNodes[1] | ||
sat := planet.Satellites[0] | ||
strayNode.Contact.Chore.Pause(ctx) | ||
sat.Overlay.DQStrayNodes.Loop.Pause() | ||
|
||
cache := planet.Satellites[0].Overlay.DB | ||
|
||
strayInfo, err := cache.Get(ctx, strayNode.ID()) | ||
require.NoError(t, err) | ||
require.Nil(t, strayInfo.Disqualified) | ||
|
||
checkInInfo := overlay.NodeCheckInInfo{ | ||
NodeID: strayNode.ID(), | ||
IsUp: true, | ||
Address: &pb.NodeAddress{ | ||
Address: "1.2.3.4", | ||
}, | ||
Version: &pb.NodeVersion{ | ||
Version: "v0.0.0", | ||
CommitHash: "", | ||
Timestamp: time.Time{}, | ||
Release: false, | ||
}, | ||
} | ||
|
||
// set strayNode last_contact_success to 48 hours ago | ||
require.NoError(t, sat.Overlay.DB.UpdateCheckIn(ctx, checkInInfo, time.Now().Add(-48*time.Hour), sat.Config.Overlay.Node)) | ||
|
||
sat.Overlay.DQStrayNodes.Loop.TriggerWait() | ||
|
||
strayInfo, err = cache.Get(ctx, strayNode.ID()) | ||
require.NoError(t, err) | ||
require.NotNil(t, strayInfo.Disqualified) | ||
|
||
liveInfo, err := cache.Get(ctx, liveNode.ID()) | ||
require.NoError(t, err) | ||
require.Nil(t, liveInfo.Disqualified) | ||
}) | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters