Skip to content

Commit 6b7c611

Browse files
committed
fix: stabilize etcd join and promote sequences
There were two issues with using the discovery service for join and promote: * on join, it resulted in joining too fast, which triggers race bugs in etcd cert generation (to be fixed in a separate PR) * on promote, Talos has to connect to a non-learner member of the cluster, which happens "automatically" with Kubernetes discovery, as it only lists running `kube-apiserver` instances, which are up only when etcd on the same node is healthy. The etcd client doesn't provide a way to avoid learner members, as even getting a member list from a learner doesn't work (to be fixed in a separate PR) Signed-off-by: Andrey Smirnov <andrey.smirnov@talos-systems.com> (cherry picked from commit 8a038d4)
1 parent 6dbc086 commit 6b7c611

File tree

2 files changed

+23
-3
lines changed

2 files changed

+23
-3
lines changed

internal/app/machined/pkg/system/services/etcd.go

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -337,7 +337,7 @@ func addMember(ctx context.Context, r runtime.Runtime, addrs []string, name stri
337337
return nil, 0, fmt.Errorf("failed to generate etcd PKI: %w", err)
338338
}
339339

340-
client, err := etcd.NewClientFromControlPlaneIPs(ctx, r.State().V1Alpha2().Resources())
340+
client, err := etcd.NewClientFromControlPlaneIPsNoDiscovery(ctx, r.State().V1Alpha2().Resources())
341341
if err != nil {
342342
return nil, 0, err
343343
}
@@ -681,11 +681,11 @@ func promoteMember(ctx context.Context, r runtime.Runtime, memberID uint64) erro
681681
// try to promote a member until it succeeds (call might fail until the member catches up with the leader)
682682
// promote member call will fail until member catches up with the master
683683
return retry.Constant(10*time.Minute,
684-
retry.WithUnits(10*time.Second),
684+
retry.WithUnits(15*time.Second),
685685
retry.WithJitter(time.Second),
686686
retry.WithErrorLogging(true),
687687
).RetryWithContext(ctx, func(ctx context.Context) error {
688-
client, err := etcd.NewClientFromControlPlaneIPs(ctx, r.State().V1Alpha2().Resources())
688+
client, err := etcd.NewClientFromControlPlaneIPsNoDiscovery(ctx, r.State().V1Alpha2().Resources())
689689
if err != nil {
690690
return retry.ExpectedError(err)
691691
}

internal/pkg/etcd/etcd.go

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ import (
99
"errors"
1010
"fmt"
1111
"log"
12+
"math/rand"
1213
"os"
1314
"time"
1415

@@ -73,6 +74,18 @@ func NewLocalClient() (client *Client, err error) {
7374
// NewClientFromControlPlaneIPs initializes and returns an etcd client
7475
// configured to talk to all members.
7576
func NewClientFromControlPlaneIPs(ctx context.Context, resources state.State) (client *Client, err error) {
77+
return newClientFromControlPlaneIPs(ctx, resources, "")
78+
}
79+
80+
// NewClientFromControlPlaneIPsNoDiscovery initializes and returns an etcd client
81+
// configured to talk to all members, but ignores discovery service data.
82+
//
83+
// Note: this is a temporary workaround for etcd join issues which will be removed once backported to Talos 1.1.x.
84+
func NewClientFromControlPlaneIPsNoDiscovery(ctx context.Context, resources state.State) (client *Client, err error) {
85+
return newClientFromControlPlaneIPs(ctx, resources, k8s.ControlPlaneDiscoveredEndpointsID)
86+
}
87+
88+
func newClientFromControlPlaneIPs(ctx context.Context, resources state.State, ignoreEndpointID string) (client *Client, err error) {
7689
endpointResources, err := resources.List(ctx, resource.NewMetadata(k8s.ControlPlaneNamespaceName, k8s.EndpointType, "", resource.VersionUndefined))
7790
if err != nil {
7891
return nil, fmt.Errorf("error getting endpoints resources: %w", err)
@@ -82,6 +95,10 @@ func NewClientFromControlPlaneIPs(ctx context.Context, resources state.State) (c
8295

8396
// merge all endpoints into a single list
8497
for _, res := range endpointResources.Items {
98+
if res.Metadata().ID() == ignoreEndpointID {
99+
continue
100+
}
101+
85102
endpointAddrs = endpointAddrs.Merge(res.(*k8s.Endpoint))
86103
}
87104

@@ -96,6 +113,9 @@ func NewClientFromControlPlaneIPs(ctx context.Context, resources state.State) (c
96113
endpoints[i] = net.FormatAddress(endpoints[i]) + ":2379"
97114
}
98115

116+
// Shuffle endpoints to establish random order on each call to avoid patterns based on sorted IP list.
117+
rand.Shuffle(len(endpoints), func(i, j int) { endpoints[i], endpoints[j] = endpoints[j], endpoints[i] })
118+
99119
return NewClient(endpoints)
100120
}
101121

0 commit comments

Comments
 (0)