Skip to content

Commit dacbee4

Browse files
committed
fix: improve etcd leave on reset process
When removing a member from `etcd`, the server does a pre-check to make sure the member is connected to a quorum of other members, and the remove request might fail. Add a retry to wait for etcd to be fully connected before giving up, as some parts of the reset flow have already run. Also fix an issue which appears in the integration test when `reset` is called early in the boot sequence, before local etcd has fully started. Signed-off-by: Andrey Smirnov <andrey.smirnov@talos-systems.com> (cherry picked from commit 40e69af)
1 parent eee9f5d commit dacbee4

File tree

2 files changed

+20
-2
lines changed

2 files changed

+20
-2
lines changed

internal/pkg/etcd/etcd.go

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ import (
1414
"time"
1515

1616
"github.com/cosi-project/runtime/pkg/state"
17+
"github.com/siderolabs/go-retry/retry"
1718
"go.etcd.io/etcd/api/v3/etcdserverpb"
1819
"go.etcd.io/etcd/api/v3/v3rpc/rpctypes"
1920
"go.etcd.io/etcd/client/pkg/v3/transport"
@@ -155,7 +156,19 @@ func (c *Client) LeaveCluster(ctx context.Context, st state.State) error {
155156
return err
156157
}
157158

158-
if err := c.RemoveMemberByMemberID(ctx, memberID); err != nil {
159+
if err := retry.Constant(5*time.Minute, retry.WithUnits(10*time.Second)).RetryWithContext(ctx, func(ctx context.Context) error {
160+
err := c.RemoveMemberByMemberID(ctx, memberID)
161+
if err == nil {
162+
return nil
163+
}
164+
165+
if errors.Is(err, rpctypes.ErrUnhealthy) {
166+
// unhealthy is returned when the member hasn't established connections with a quorum of other members
167+
return retry.ExpectedError(err)
168+
}
169+
170+
return err
171+
}); err != nil {
159172
return err
160173
}
161174

internal/pkg/etcd/local.go

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ package etcd
77
import (
88
"context"
99
"fmt"
10+
"time"
1011

1112
"github.com/cosi-project/runtime/pkg/safe"
1213
"github.com/cosi-project/runtime/pkg/state"
@@ -16,10 +17,14 @@ import (
1617

1718
// GetLocalMemberID gets the etcd member id of the local node via resources.
1819
func GetLocalMemberID(ctx context.Context, s state.State) (uint64, error) {
19-
member, err := safe.ReaderGet[*etcd.Member](
20+
ctx, cancel := context.WithTimeout(ctx, 3*time.Minute)
21+
defer cancel()
22+
23+
member, err := safe.StateWatchFor[*etcd.Member](
2024
ctx,
2125
s,
2226
etcd.NewMember(etcd.NamespaceName, etcd.LocalMemberID).Metadata(),
27+
state.WithEventTypes(state.Created),
2328
)
2429
if err != nil {
2530
return 0, fmt.Errorf("failed to get local etcd member ID: %w", err)

0 commit comments

Comments
 (0)