Skip to content

Commit eefe1c2

Browse files
committed
feat: add new etcd members in learner mode
Fixes #3714 This provides a safer way to join new members to the etcd cluster. See https://etcd.io/docs/v3.4/learning/design-learner/ With learner mode join there are a few differences: * new nodes are joined one by one, because etcd enforces a single learner member in the cluster * learner members are not counted in quorum calculations, so while the learner catches up with the leader, quorum is not affected and the cluster is still operational Signed-off-by: Andrey Smirnov <andrey.smirnov@talos-systems.com>
1 parent b1c66fb commit eefe1c2

File tree

8 files changed

+123
-19
lines changed

8 files changed

+123
-19
lines changed

api/machine/machine.proto

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -791,6 +791,8 @@ message EtcdMember {
791791
repeated string peer_urls = 4;
792792
// the list of URLs the member exposes to clients for communication.
793793
repeated string client_urls = 5;
794+
// learner flag
795+
bool is_learner = 6;
794796
}
795797

796798
// EtcdMembers contains the list of members registered on the host.

cmd/talosctl/cmd/talos/etcd.go

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -89,7 +89,7 @@ var etcdMemberListCmd = &cobra.Command{
8989

9090
w := tabwriter.NewWriter(os.Stdout, 0, 0, 3, ' ', 0)
9191
node := ""
92-
pattern := "%s\t%s\t%s\t%s\n"
92+
pattern := "%s\t%s\t%s\t%s\t%v\n"
9393

9494
for i, message := range response.Messages {
9595
if message.Metadata != nil && message.Metadata.Hostname != "" {
@@ -103,10 +103,10 @@ var etcdMemberListCmd = &cobra.Command{
103103
for j, member := range message.Members {
104104
if i == 0 && j == 0 {
105105
if node != "" {
106-
fmt.Fprintln(w, "NODE\tID\tHOSTNAME\tPEER URLS\tCLIENT URLS")
106+
fmt.Fprintln(w, "NODE\tID\tHOSTNAME\tPEER URLS\tCLIENT URLS\tLEARNER")
107107
pattern = "%s\t" + pattern
108108
} else {
109-
fmt.Fprintln(w, "ID\tHOSTNAME\tPEER URLS\tCLIENT URLS")
109+
fmt.Fprintln(w, "ID\tHOSTNAME\tPEER URLS\tCLIENT URLS\tLEARNER")
110110
}
111111
}
112112

@@ -115,6 +115,7 @@ var etcdMemberListCmd = &cobra.Command{
115115
member.Hostname,
116116
strings.Join(member.PeerUrls, ","),
117117
strings.Join(member.ClientUrls, ","),
118+
member.IsLearner,
118119
}
119120
if node != "" {
120121
args = append([]interface{}{node}, args...)

hack/release.toml

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,13 @@ the default values overwritten by Talos.
6161
* runc: 1.0.1
6262
* GRUB: 2.06
6363
* Talos is built with Go 1.16.6
64+
"""
65+
66+
[notes.etcd]
67+
title = "etcd"
68+
description = """\
69+
New etcd cluster members are now joined in [learner mode](https://etcd.io/docs/v3.4/learning/design-learner/), which improves cluster resiliency
70+
to member join issues.
6471
"""
6572

6673
[notes.capi]

internal/app/machined/internal/server/v1alpha1/v1alpha1_server.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1712,6 +1712,7 @@ func (s *Server) EtcdMemberList(ctx context.Context, in *machine.EtcdMemberListR
17121712
Hostname: member.GetName(),
17131713
PeerUrls: member.GetPeerURLs(),
17141714
ClientUrls: member.GetClientURLs(),
1715+
IsLearner: member.GetIsLearner(),
17151716
},
17161717
)
17171718

internal/app/machined/pkg/system/services/etcd.go

Lines changed: 55 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,11 @@ type Etcd struct {
5858

5959
args []string
6060
client *etcd.Client
61+
62+
// if the new member was added as a learner during the service start, its ID is kept here
63+
learnerMemberID uint64
64+
65+
promoteCtxCancel context.CancelFunc
6166
}
6267

6368
// ID implements the Service interface.
@@ -95,6 +100,9 @@ func (e *Etcd) PreFunc(ctx context.Context, r runtime.Runtime) (err error) {
95100
return fmt.Errorf("failed to pull image %q: %w", r.Config().Cluster().Etcd().Image(), err)
96101
}
97102

103+
// Clear any previously set learner member ID
104+
e.learnerMemberID = 0
105+
98106
switch t := r.Config().Machine().Type(); t {
99107
case machine.TypeInit:
100108
return e.argsForInit(ctx, r)
@@ -111,6 +119,10 @@ func (e *Etcd) PreFunc(ctx context.Context, r runtime.Runtime) (err error) {
111119

112120
// PostFunc implements the Service interface.
113121
func (e *Etcd) PostFunc(r runtime.Runtime, state events.ServiceState) (err error) {
122+
if e.promoteCtxCancel != nil {
123+
e.promoteCtxCancel()
124+
}
125+
114126
if e.client != nil {
115127
e.client.Close() //nolint:errcheck
116128
}
@@ -157,6 +169,20 @@ func (e *Etcd) Runner(r runtime.Runtime) (runner.Runner, error) {
157169

158170
env = append(env, "ETCD_CIPHER_SUITES=TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384,TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384,TLS_ECDHE_ECDSA_WITH_CHACHA20_POLY1305,TLS_ECDHE_RSA_WITH_CHACHA20_POLY1305") //nolint:lll
159171

172+
if e.learnerMemberID != 0 {
173+
var promoteCtx context.Context
174+
175+
promoteCtx, e.promoteCtxCancel = context.WithCancel(context.Background())
176+
177+
go func() {
178+
if err := promoteMember(promoteCtx, r, e.learnerMemberID); err != nil && !errors.Is(err, context.Canceled) {
179+
log.Printf("failed promoting member: %s", err)
180+
} else if err == nil {
181+
log.Printf("successfully promoted etcd member")
182+
}
183+
}()
184+
}
185+
160186
return restart.New(containerd.NewRunner(
161187
r.Config().Debug(),
162188
&args,
@@ -304,7 +330,7 @@ func addMember(ctx context.Context, r runtime.Runtime, addrs []string, name stri
304330
}
305331
}
306332

307-
add, err := client.MemberAdd(ctx, addrs)
333+
add, err := client.MemberAddAsLearner(ctx, addrs)
308334
if err != nil {
309335
return nil, 0, fmt.Errorf("error adding member: %w", err)
310336
}
@@ -317,7 +343,9 @@ func addMember(ctx context.Context, r runtime.Runtime, addrs []string, name stri
317343
return list, add.Member.ID, nil
318344
}
319345

320-
func buildInitialCluster(ctx context.Context, r runtime.Runtime, name, ip string) (initial string, err error) {
346+
func buildInitialCluster(ctx context.Context, r runtime.Runtime, name, ip string) (initial string, learnerMemberID uint64, err error) {
347+
var id uint64
348+
321349
err = retry.Constant(10*time.Minute,
322350
retry.WithUnits(3*time.Second),
323351
retry.WithJitter(time.Second),
@@ -326,7 +354,6 @@ func buildInitialCluster(ctx context.Context, r runtime.Runtime, name, ip string
326354
var (
327355
peerAddrs = []string{"https://" + net.FormatAddress(ip) + ":2380"}
328356
resp *clientv3.MemberListResponse
329-
id uint64
330357
)
331358

332359
attemptCtx, attemptCtxCancel := context.WithTimeout(ctx, 30*time.Second)
@@ -362,10 +389,10 @@ func buildInitialCluster(ctx context.Context, r runtime.Runtime, name, ip string
362389
})
363390

364391
if err != nil {
365-
return "", fmt.Errorf("failed to build cluster arguments: %w", err)
392+
return "", 0, fmt.Errorf("failed to build cluster arguments: %w", err)
366393
}
367394

368-
return initial, nil
395+
return initial, id, nil
369396
}
370397

371398
//nolint:gocyclo
@@ -441,7 +468,7 @@ func (e *Etcd) argsForInit(ctx context.Context, r runtime.Runtime) error {
441468
if upgraded {
442469
denyListArgs.Set("initial-cluster-state", "existing")
443470

444-
initialCluster, err = buildInitialCluster(ctx, r, hostname, primaryAddr)
471+
initialCluster, e.learnerMemberID, err = buildInitialCluster(ctx, r, hostname, primaryAddr)
445472
if err != nil {
446473
return err
447474
}
@@ -534,7 +561,7 @@ func (e *Etcd) argsForControlPlane(ctx context.Context, r runtime.Runtime) error
534561
if e.Bootstrap {
535562
initialCluster = fmt.Sprintf("%s=https://%s:2380", hostname, net.FormatAddress(primaryAddr))
536563
} else {
537-
initialCluster, err = buildInitialCluster(ctx, r, hostname, primaryAddr)
564+
initialCluster, e.learnerMemberID, err = buildInitialCluster(ctx, r, hostname, primaryAddr)
538565
if err != nil {
539566
return fmt.Errorf("failed to build initial etcd cluster: %w", err)
540567
}
@@ -591,6 +618,27 @@ func (e *Etcd) recoverFromSnapshot(hostname, primaryAddr string) error {
591618
return nil
592619
}
593620

621+
// promoteMember asks the etcd cluster to promote the member identified by memberID.
//
// Each attempt builds a new etcd client from the cluster CA and endpoint found in the
// runtime config, issues MemberPromote for memberID, and closes the client again.
// Attempts are driven by retry.Constant configured with 10*time.Minute, 10-second units
// and one second of jitter, with error logging enabled.
//
// NOTE(review): presumably 10 minutes is the overall retry budget and the loop stops
// early when ctx is cancelled — confirm against the retry package.
//
// The returned error is whatever the retry loop reports; it is expected to be nil once
// MemberPromote succeeds (see the note on the final return below).
func promoteMember(ctx context.Context, r runtime.Runtime, memberID uint64) error {
622+
// The promote call is expected to fail until the new member has caught up with the
// leader, so the call is repeated instead of being attempted once.
623+
// Every failure below (client creation or promotion) is wrapped in retry.ExpectedError.
624+
return retry.Constant(10*time.Minute,
625+
retry.WithUnits(10*time.Second),
626+
retry.WithJitter(time.Second),
627+
retry.WithErrorLogging(true),
628+
).RetryWithContext(ctx, func(ctx context.Context) error {
629+
client, err := etcd.NewClientFromControlPlaneIPs(ctx, r.Config().Cluster().CA(), r.Config().Cluster().Endpoint())
630+
if err != nil {
631+
return retry.ExpectedError(err)
632+
}
633+
634+
defer client.Close() //nolint:errcheck
635+
636+
_, err = client.MemberPromote(ctx, memberID)
637+
638+
// NOTE(review): err may be nil here; this relies on retry.ExpectedError(nil) yielding
// nil so that a successful promotion ends the retry loop — confirm.
return retry.ExpectedError(err)
639+
})
640+
}
641+
594642
// IsDirEmpty checks if a directory is empty or not.
595643
func IsDirEmpty(name string) (bool, error) {
596644
f, err := os.Open(name)

pkg/machinery/api/machine/machine.pb.go

Lines changed: 20 additions & 9 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pkg/machinery/api/machine/machine_vtproto.pb.go

Lines changed: 33 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

website/content/docs/v0.12/Reference/api.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1023,6 +1023,7 @@ EtcdMember describes a single etcd member.
10231023
| hostname | [string](#string) | | human-readable name of the member. |
10241024
| peer_urls | [string](#string) | repeated | the list of URLs the member exposes to the cluster for communication. |
10251025
| client_urls | [string](#string) | repeated | the list of URLs the member exposes to clients for communication. |
1026+
| is_learner | [bool](#bool) | | learner flag |
10261027

10271028

10281029

0 commit comments

Comments
 (0)