Skip to content

Commit

Permalink
fix: improve etcd leave on reset process
Browse files Browse the repository at this point in the history
When removing a member from `etcd`, the server does a pre-check to make
sure the member is connected to a quorum of other members, and the
remove request might fail. Add a retry to wait for etcd to be fully
connected before giving up, as some parts of the reset flow have already run.

Also fix an issue which appears in the integration test, when `reset` is
called early in the boot sequence, before local etcd has fully started.

Signed-off-by: Andrey Smirnov <andrey.smirnov@talos-systems.com>
  • Loading branch information
smira committed Mar 1, 2023
1 parent 638dc91 commit 40e69af
Show file tree
Hide file tree
Showing 2 changed files with 20 additions and 2 deletions.
15 changes: 14 additions & 1 deletion internal/pkg/etcd/etcd.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ import (
"time"

"github.com/cosi-project/runtime/pkg/state"
"github.com/siderolabs/go-retry/retry"
"go.etcd.io/etcd/api/v3/etcdserverpb"
"go.etcd.io/etcd/api/v3/v3rpc/rpctypes"
"go.etcd.io/etcd/client/pkg/v3/transport"
Expand Down Expand Up @@ -155,7 +156,19 @@ func (c *Client) LeaveCluster(ctx context.Context, st state.State) error {
return err
}

if err := c.RemoveMemberByMemberID(ctx, memberID); err != nil {
if err := retry.Constant(5*time.Minute, retry.WithUnits(10*time.Second)).RetryWithContext(ctx, func(ctx context.Context) error {
err := c.RemoveMemberByMemberID(ctx, memberID)
if err == nil {
return nil
}

if errors.Is(err, rpctypes.ErrUnhealthy) {
// unhealthy is returned when the member hasn't established connections with a quorum of other members
return retry.ExpectedError(err)
}

return err
}); err != nil {
return err
}

Expand Down
7 changes: 6 additions & 1 deletion internal/pkg/etcd/local.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ package etcd
import (
"context"
"fmt"
"time"

"github.com/cosi-project/runtime/pkg/safe"
"github.com/cosi-project/runtime/pkg/state"
Expand All @@ -16,10 +17,14 @@ import (

// GetLocalMemberID gets the etcd member id of the local node via resources.
func GetLocalMemberID(ctx context.Context, s state.State) (uint64, error) {
member, err := safe.ReaderGet[*etcd.Member](
ctx, cancel := context.WithTimeout(ctx, 3*time.Minute)
defer cancel()

member, err := safe.StateWatchFor[*etcd.Member](
ctx,
s,
etcd.NewMember(etcd.NamespaceName, etcd.LocalMemberID).Metadata(),
state.WithEventTypes(state.Created),
)
if err != nil {
return 0, fmt.Errorf("failed to get local etcd member ID: %w", err)
Expand Down

0 comments on commit 40e69af

Please sign in to comment.