Skip to content

Commit

Permalink
feat: add new etcd members in learner mode
Browse files Browse the repository at this point in the history
Fixes #3714

This provides a safer way to join new members to the etcd cluster.

See https://etcd.io/docs/v3.4/learning/design-learner/

With learner mode join there are a few differences:

* new nodes are joined one by one, because etcd enforces a single
learner member in the cluster
* learner members are not counted in quorum calculations, so while
the learner catches up with the leader, quorum is not affected and
the cluster is still operational

Signed-off-by: Andrey Smirnov <andrey.smirnov@talos-systems.com>
  • Loading branch information
smira committed Aug 12, 2021
1 parent b1c66fb commit eefe1c2
Show file tree
Hide file tree
Showing 8 changed files with 123 additions and 19 deletions.
2 changes: 2 additions & 0 deletions api/machine/machine.proto
Expand Up @@ -791,6 +791,8 @@ message EtcdMember {
repeated string peer_urls = 4;
// the list of URLs the member exposes to the cluster for communication.
repeated string client_urls = 5;
// learner flag
bool is_learner = 6;
}

// EtcdMembers contains the list of members registered on the host.
Expand Down
7 changes: 4 additions & 3 deletions cmd/talosctl/cmd/talos/etcd.go
Expand Up @@ -89,7 +89,7 @@ var etcdMemberListCmd = &cobra.Command{

w := tabwriter.NewWriter(os.Stdout, 0, 0, 3, ' ', 0)
node := ""
pattern := "%s\t%s\t%s\t%s\n"
pattern := "%s\t%s\t%s\t%s\t%v\n"

for i, message := range response.Messages {
if message.Metadata != nil && message.Metadata.Hostname != "" {
Expand All @@ -103,10 +103,10 @@ var etcdMemberListCmd = &cobra.Command{
for j, member := range message.Members {
if i == 0 && j == 0 {
if node != "" {
fmt.Fprintln(w, "NODE\tID\tHOSTNAME\tPEER URLS\tCLIENT URLS")
fmt.Fprintln(w, "NODE\tID\tHOSTNAME\tPEER URLS\tCLIENT URLS\tLEARNER")
pattern = "%s\t" + pattern
} else {
fmt.Fprintln(w, "ID\tHOSTNAME\tPEER URLS\tCLIENT URLS")
fmt.Fprintln(w, "ID\tHOSTNAME\tPEER URLS\tCLIENT URLS\tLEARNER")
}
}

Expand All @@ -115,6 +115,7 @@ var etcdMemberListCmd = &cobra.Command{
member.Hostname,
strings.Join(member.PeerUrls, ","),
strings.Join(member.ClientUrls, ","),
member.IsLearner,
}
if node != "" {
args = append([]interface{}{node}, args...)
Expand Down
7 changes: 7 additions & 0 deletions hack/release.toml
Expand Up @@ -61,6 +61,13 @@ the default values overwritten by Talos.
* runc: 1.0.1
* GRUB: 2.06
* Talos is built with Go 1.16.6
"""

[notes.etcd]
title = "etcd"
description = """\
New etcd cluster members are now joined in [learner mode](https://etcd.io/docs/v3.4/learning/design-learner/), which improves cluster resiliency
to member join issues.
"""

[notes.capi]
Expand Down
Expand Up @@ -1712,6 +1712,7 @@ func (s *Server) EtcdMemberList(ctx context.Context, in *machine.EtcdMemberListR
Hostname: member.GetName(),
PeerUrls: member.GetPeerURLs(),
ClientUrls: member.GetClientURLs(),
IsLearner: member.GetIsLearner(),
},
)

Expand Down
62 changes: 55 additions & 7 deletions internal/app/machined/pkg/system/services/etcd.go
Expand Up @@ -58,6 +58,11 @@ type Etcd struct {

args []string
client *etcd.Client

// if the new member was added as a learner during the service start, its ID is kept here
learnerMemberID uint64

promoteCtxCancel context.CancelFunc
}

// ID implements the Service interface.
Expand Down Expand Up @@ -95,6 +100,9 @@ func (e *Etcd) PreFunc(ctx context.Context, r runtime.Runtime) (err error) {
return fmt.Errorf("failed to pull image %q: %w", r.Config().Cluster().Etcd().Image(), err)
}

// Clear any previously set learner member ID
e.learnerMemberID = 0

switch t := r.Config().Machine().Type(); t {
case machine.TypeInit:
return e.argsForInit(ctx, r)
Expand All @@ -111,6 +119,10 @@ func (e *Etcd) PreFunc(ctx context.Context, r runtime.Runtime) (err error) {

// PostFunc implements the Service interface.
func (e *Etcd) PostFunc(r runtime.Runtime, state events.ServiceState) (err error) {
if e.promoteCtxCancel != nil {
e.promoteCtxCancel()
}

if e.client != nil {
e.client.Close() //nolint:errcheck
}
Expand Down Expand Up @@ -157,6 +169,20 @@ func (e *Etcd) Runner(r runtime.Runtime) (runner.Runner, error) {

env = append(env, "ETCD_CIPHER_SUITES=TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384,TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384,TLS_ECDHE_ECDSA_WITH_CHACHA20_POLY1305,TLS_ECDHE_RSA_WITH_CHACHA20_POLY1305") //nolint:lll

if e.learnerMemberID != 0 {
var promoteCtx context.Context

promoteCtx, e.promoteCtxCancel = context.WithCancel(context.Background())

go func() {
if err := promoteMember(promoteCtx, r, e.learnerMemberID); err != nil && !errors.Is(err, context.Canceled) {
log.Printf("failed promoting member: %s", err)
} else if err == nil {
log.Printf("successfully promoted etcd member")
}
}()
}

return restart.New(containerd.NewRunner(
r.Config().Debug(),
&args,
Expand Down Expand Up @@ -304,7 +330,7 @@ func addMember(ctx context.Context, r runtime.Runtime, addrs []string, name stri
}
}

add, err := client.MemberAdd(ctx, addrs)
add, err := client.MemberAddAsLearner(ctx, addrs)
if err != nil {
return nil, 0, fmt.Errorf("error adding member: %w", err)
}
Expand All @@ -317,7 +343,9 @@ func addMember(ctx context.Context, r runtime.Runtime, addrs []string, name stri
return list, add.Member.ID, nil
}

func buildInitialCluster(ctx context.Context, r runtime.Runtime, name, ip string) (initial string, err error) {
func buildInitialCluster(ctx context.Context, r runtime.Runtime, name, ip string) (initial string, learnerMemberID uint64, err error) {
var id uint64

err = retry.Constant(10*time.Minute,
retry.WithUnits(3*time.Second),
retry.WithJitter(time.Second),
Expand All @@ -326,7 +354,6 @@ func buildInitialCluster(ctx context.Context, r runtime.Runtime, name, ip string
var (
peerAddrs = []string{"https://" + net.FormatAddress(ip) + ":2380"}
resp *clientv3.MemberListResponse
id uint64
)

attemptCtx, attemptCtxCancel := context.WithTimeout(ctx, 30*time.Second)
Expand Down Expand Up @@ -362,10 +389,10 @@ func buildInitialCluster(ctx context.Context, r runtime.Runtime, name, ip string
})

if err != nil {
return "", fmt.Errorf("failed to build cluster arguments: %w", err)
return "", 0, fmt.Errorf("failed to build cluster arguments: %w", err)
}

return initial, nil
return initial, id, nil
}

//nolint:gocyclo
Expand Down Expand Up @@ -441,7 +468,7 @@ func (e *Etcd) argsForInit(ctx context.Context, r runtime.Runtime) error {
if upgraded {
denyListArgs.Set("initial-cluster-state", "existing")

initialCluster, err = buildInitialCluster(ctx, r, hostname, primaryAddr)
initialCluster, e.learnerMemberID, err = buildInitialCluster(ctx, r, hostname, primaryAddr)
if err != nil {
return err
}
Expand Down Expand Up @@ -534,7 +561,7 @@ func (e *Etcd) argsForControlPlane(ctx context.Context, r runtime.Runtime) error
if e.Bootstrap {
initialCluster = fmt.Sprintf("%s=https://%s:2380", hostname, net.FormatAddress(primaryAddr))
} else {
initialCluster, err = buildInitialCluster(ctx, r, hostname, primaryAddr)
initialCluster, e.learnerMemberID, err = buildInitialCluster(ctx, r, hostname, primaryAddr)
if err != nil {
return fmt.Errorf("failed to build initial etcd cluster: %w", err)
}
Expand Down Expand Up @@ -591,6 +618,27 @@ func (e *Etcd) recoverFromSnapshot(hostname, primaryAddr string) error {
return nil
}

// promoteMember keeps trying to promote the etcd learner member identified by memberID.
//
// The promote call might fail until the learner has caught up with the leader, so every
// failure (including a failure to build the client) is handed back to the retrier wrapped
// with retry.ExpectedError. The retrier is configured with a 10 minute duration,
// 10 second units and 1 second jitter, has error logging enabled, and is bound to ctx.
func promoteMember(ctx context.Context, r runtime.Runtime, memberID uint64) error {
	// attempt performs a single promotion try using a client that lives only for this try.
	attempt := func(attemptCtx context.Context) error {
		ca := r.Config().Cluster().CA()
		endpoint := r.Config().Cluster().Endpoint()

		etcdClient, clientErr := etcd.NewClientFromControlPlaneIPs(attemptCtx, ca, endpoint)
		if clientErr != nil {
			return retry.ExpectedError(clientErr)
		}

		defer etcdClient.Close() //nolint:errcheck

		_, promoteErr := etcdClient.MemberPromote(attemptCtx, memberID)

		// The promote result is passed through unconditionally, exactly as returned.
		return retry.ExpectedError(promoteErr)
	}

	retrier := retry.Constant(10*time.Minute,
		retry.WithUnits(10*time.Second),
		retry.WithJitter(time.Second),
		retry.WithErrorLogging(true),
	)

	return retrier.RetryWithContext(ctx, attempt)
}

// IsDirEmpty checks if a directory is empty or not.
func IsDirEmpty(name string) (bool, error) {
f, err := os.Open(name)
Expand Down
29 changes: 20 additions & 9 deletions pkg/machinery/api/machine/machine.pb.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

33 changes: 33 additions & 0 deletions pkg/machinery/api/machine/machine_vtproto.pb.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions website/content/docs/v0.12/Reference/api.md
Expand Up @@ -1023,6 +1023,7 @@ EtcdMember describes a single etcd member.
| hostname | [string](#string) | | human-readable name of the member. |
| peer_urls | [string](#string) | repeated | the list of URLs the member exposes to the cluster for communication. |
| client_urls | [string](#string) | repeated | the list of URLs the member exposes to clients for communication. |
| is_learner | [bool](#bool) | | learner flag |



Expand Down

0 comments on commit eefe1c2

Please sign in to comment.