Skip to content

Commit

Permalink
feat: implement etcd maintenance commands
Browse files Browse the repository at this point in the history
This makes it possible to safely recover from out-of-space quota issues and to
perform defragmentation as needed.

`talosctl etcd status` command provides lots of information about the
cluster health.

See docs for more details.

Fixes #4889

Signed-off-by: Andrey Smirnov <andrey.smirnov@talos-systems.com>
  • Loading branch information
smira committed Jan 3, 2023
1 parent 80fed31 commit 96629d5
Show file tree
Hide file tree
Showing 14 changed files with 12,973 additions and 8,989 deletions.
77 changes: 77 additions & 0 deletions api/machine/machine.proto
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,25 @@ service MachineService {
//
// This method is available only on control plane nodes (which run etcd).
rpc EtcdSnapshot(EtcdSnapshotRequest) returns (stream common.Data);
// EtcdAlarmList lists etcd alarms for the current node.
//
// This method is available only on control plane nodes (which run etcd).
rpc EtcdAlarmList(google.protobuf.Empty) returns (EtcdAlarmListResponse);
// EtcdAlarmDisarm disarms etcd alarms for the current node.
//
// This method is available only on control plane nodes (which run etcd).
rpc EtcdAlarmDisarm(google.protobuf.Empty) returns (EtcdAlarmDisarmResponse);
// EtcdDefragment defragments etcd data directory for the current node.
//
// Defragmentation is a resource-heavy operation, so it should only run on a specific
// node.
//
// This method is available only on control plane nodes (which run etcd).
rpc EtcdDefragment(google.protobuf.Empty) returns (EtcdDefragmentResponse);
// EtcdStatus returns etcd status for the current member.
//
// This method is available only on control plane nodes (which run etcd).
rpc EtcdStatus(google.protobuf.Empty) returns (EtcdStatusResponse);
rpc GenerateConfiguration(GenerateConfigurationRequest) returns (GenerateConfigurationResponse);
rpc Hostname(google.protobuf.Empty) returns (HostnameResponse);
rpc Kubeconfig(google.protobuf.Empty) returns (stream common.Data);
Expand Down Expand Up @@ -982,6 +1001,64 @@ message EtcdRecoverResponse {
repeated EtcdRecover messages = 1;
}

// EtcdAlarmListResponse is the response message returned by the EtcdAlarmList RPC.
message EtcdAlarmListResponse {
// messages carries the EtcdAlarm replies.
repeated EtcdAlarm messages = 1;
}

// EtcdAlarm is a single reply inside EtcdAlarmListResponse.
message EtcdAlarm {
// metadata describes the origin of this reply (talosctl reads the hostname from it).
common.Metadata metadata = 1;
// member_alarms lists the alarms, one entry per (member, alarm type) pair.
repeated EtcdMemberAlarm member_alarms = 2;
}

// EtcdMemberAlarm pairs an etcd member ID with an alarm type.
message EtcdMemberAlarm {
// AlarmType enumerates the alarm kinds that can be reported.
// NOTE(review): presumably mirrors etcd's own alarm types — confirm against the etcd API.
enum AlarmType {
NONE = 0;
NOSPACE = 1;
CORRUPT = 2;
}
// member_id is the numeric ID of the etcd member the alarm refers to.
uint64 member_id = 1;
// alarm is the type of the alarm.
AlarmType alarm = 2;
}

// EtcdAlarmDisarmResponse is the response message returned by the EtcdAlarmDisarm RPC.
message EtcdAlarmDisarmResponse {
// messages carries the EtcdAlarmDisarm replies.
repeated EtcdAlarmDisarm messages = 1;
}

// EtcdAlarmDisarm is a single reply inside EtcdAlarmDisarmResponse.
// It has the same shape as EtcdAlarm (metadata plus a list of member alarms).
message EtcdAlarmDisarm {
// metadata describes the origin of this reply (talosctl reads the hostname from it).
common.Metadata metadata = 1;
// member_alarms lists member alarms.
// NOTE(review): presumably the alarms that were disarmed — confirm against the server implementation.
repeated EtcdMemberAlarm member_alarms = 2;
}

// EtcdDefragmentResponse is the response message returned by the EtcdDefragment RPC.
message EtcdDefragmentResponse {
// messages carries the EtcdDefragment replies.
repeated EtcdDefragment messages = 1;
}

// EtcdDefragment is a single reply inside EtcdDefragmentResponse.
// It carries no payload beyond the reply metadata.
message EtcdDefragment {
// metadata describes the origin of this reply.
common.Metadata metadata = 1;
}

// EtcdStatusResponse is the response message returned by the EtcdStatus RPC.
message EtcdStatusResponse {
// messages carries the EtcdStatus replies.
repeated EtcdStatus messages = 1;
}

// EtcdStatus is a single reply inside EtcdStatusResponse.
message EtcdStatus {
// metadata describes the origin of this reply (talosctl reads the hostname from it).
common.Metadata metadata = 1;
// member_status is the status of the etcd member.
EtcdMemberStatus member_status = 2;
}

// EtcdMemberStatus describes the state of one etcd member.
//
// Note that member_id uses field number 10 although it is declared first;
// field numbers, not declaration order, define the wire format.
message EtcdMemberStatus {
// member_id is the numeric ID of the etcd member.
uint64 member_id = 10;
// protocol_version is the protocol version string.
// NOTE(review): presumably the etcd cluster protocol version — confirm.
string protocol_version = 1;
// db_size is the database size; talosctl renders it as a byte count.
int64 db_size = 2;
// db_size_in_use is the in-use portion of the database; talosctl renders it
// as a byte count and as a percentage of db_size.
int64 db_size_in_use = 3;
// leader is rendered by talosctl as a member ID.
// NOTE(review): presumably the ID of the current raft leader — confirm.
uint64 leader = 4;
// raft_index, raft_term and raft_applied_index are raft counters.
// NOTE(review): exact semantics come from etcd — confirm against the etcd status API.
uint64 raft_index = 5;
uint64 raft_term = 6;
uint64 raft_applied_index = 7;
// errors is a list of error strings attached to the member status.
repeated string errors = 8;
// is_learner tells whether the member is a learner.
bool is_learner = 9;
}

// rpc generateConfiguration

message RouteConfig {
Expand Down
198 changes: 197 additions & 1 deletion cmd/talosctl/cmd/talos/etcd.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,13 +14,16 @@ import (
"sync"
"text/tabwriter"

"github.com/dustin/go-humanize"
"github.com/siderolabs/gen/slices"
"github.com/spf13/cobra"
snapshot "go.etcd.io/etcd/etcdutl/v3/snapshot"
"google.golang.org/grpc/codes"

"github.com/siderolabs/talos/cmd/talosctl/pkg/talos/helpers"
"github.com/siderolabs/talos/pkg/cli"
"github.com/siderolabs/talos/pkg/logging"
"github.com/siderolabs/talos/pkg/machinery/api/common"
"github.com/siderolabs/talos/pkg/machinery/api/machine"
"github.com/siderolabs/talos/pkg/machinery/client"
etcdresource "github.com/siderolabs/talos/pkg/machinery/resources/etcd"
Expand All @@ -33,12 +36,127 @@ var etcdCmd = &cobra.Command{
Long: ``,
}

// etcdAlarmCmd is the `etcd alarm` parent command; it has no action of its own
// and only serves as the attachment point for alarm sub-commands.
var etcdAlarmCmd = &cobra.Command{
	Short: "Manage etcd alarms",
	Use:   "alarm",
	Long:  "",
}

type alarmMessage interface {
GetMetadata() *common.Metadata
GetMemberAlarms() []*machine.EtcdMemberAlarm
}

// displayAlarms renders etcd member alarms as a tab-aligned table on stdout.
//
// Every alarm of every message becomes one row with the member ID and the
// alarm type. When a hostname is known from the message metadata at the moment
// the header is printed, a leading NODE column is added to the header and to
// every row. Nothing is printed when there are no alarms at all.
//
// It returns the error reported by flushing the tabwriter, if any.
func displayAlarms(messages []alarmMessage) error {
	w := tabwriter.NewWriter(os.Stdout, 0, 0, 3, ' ', 0)
	node := ""
	pattern := "%s\t%s\n"
	header := "MEMBER\tALARM"

	// headerPrinted and withNode are tracked explicitly instead of relying on
	// loop indices: the first message may carry no alarms at all, in which case
	// an index-based check would never print the header nor extend the row
	// pattern, and later rows would be formatted with a surplus argument.
	headerPrinted := false
	withNode := false

	for _, message := range messages {
		if message.GetMetadata() != nil && message.GetMetadata().GetHostname() != "" {
			node = message.GetMetadata().GetHostname()
		}

		for _, alarm := range message.GetMemberAlarms() {
			if !headerPrinted {
				// The table layout is decided once, before the first row.
				withNode = node != ""

				if withNode {
					header = "NODE\t" + header
					pattern = "%s\t" + pattern
				}

				fmt.Fprintln(w, header)

				headerPrinted = true
			}

			args := []any{
				etcdresource.FormatMemberID(alarm.GetMemberId()),
				alarm.GetAlarm().String(),
			}

			// Keep the argument count in sync with the pattern chosen above.
			if withNode {
				args = append([]any{node}, args...)
			}

			fmt.Fprintf(w, pattern, args...)
		}
	}

	return w.Flush()
}

// etcdAlarmListCmd implements `etcd alarm list`: it queries the etcd alarms via
// the API and prints them as a table.
var etcdAlarmListCmd = &cobra.Command{
	Use:   "list",
	Short: "List the etcd alarms for the node.",
	Long:  ``,
	RunE: func(cmd *cobra.Command, args []string) error {
		listAlarms := func(ctx context.Context, c *client.Client) error {
			resp, err := c.EtcdAlarmList(ctx)

			switch {
			case err != nil && resp == nil:
				// Nothing usable came back, so the call failed as a whole.
				return fmt.Errorf("error getting alarms: %w", err)
			case err != nil:
				// A response is available despite the error: warn and show it.
				cli.Warning("%s", err)
			}

			toAlarmMessage := func(v *machine.EtcdAlarm) alarmMessage {
				return v
			}

			return displayAlarms(slices.Map(resp.Messages, toAlarmMessage))
		}

		return WithClient(listAlarms)
	},
}

// etcdAlarmDisarmCmd implements `etcd alarm disarm`: it asks the API to disarm
// the etcd alarms and prints the alarms returned in the reply as a table.
var etcdAlarmDisarmCmd = &cobra.Command{
	Use:   "disarm",
	Short: "Disarm the etcd alarms for the node.",
	Long:  ``,
	RunE: func(cmd *cobra.Command, args []string) error {
		disarmAlarms := func(ctx context.Context, c *client.Client) error {
			resp, err := c.EtcdAlarmDisarm(ctx)

			switch {
			case err != nil && resp == nil:
				// Nothing usable came back, so the call failed as a whole.
				return fmt.Errorf("error disarming alarms: %w", err)
			case err != nil:
				// A response is available despite the error: warn and show it.
				cli.Warning("%s", err)
			}

			toAlarmMessage := func(v *machine.EtcdAlarmDisarm) alarmMessage {
				return v
			}

			return displayAlarms(slices.Map(resp.Messages, toAlarmMessage))
		}

		return WithClient(disarmAlarms)
	},
}

// etcdDefragCmd implements `etcd defrag`: it refuses to run against multiple
// nodes and otherwise triggers etcd defragmentation via the API.
var etcdDefragCmd = &cobra.Command{
	Use:   "defrag",
	Short: "Defragment etcd database on the node",
	Long: `Defragmentation is a maintenance operation that releases unused space from the etcd database file.
Defragmentation is a resource heavy operation and should be performed only when necessary on a single node at a time.`,
	RunE: func(cmd *cobra.Command, args []string) error {
		defragment := func(ctx context.Context, c *client.Client) error {
			err := helpers.FailIfMultiNodes(ctx, "etcd defrag")
			if err == nil {
				// The response carries nothing the CLI displays; only the error matters.
				_, err = c.EtcdDefragment(ctx)
			}

			return err
		}

		return WithClient(defragment)
	},
}

var etcdLeaveCmd = &cobra.Command{
Use: "leave",
Short: "Tell nodes to leave etcd cluster",
Long: ``,
RunE: func(cmd *cobra.Command, args []string) error {
return WithClient(func(ctx context.Context, c *client.Client) error {
if err := helpers.FailIfMultiNodes(ctx, "etcd leave"); err != nil {
return err
}

return c.EtcdLeaveCluster(ctx, &machine.EtcdLeaveClusterRequest{})
})
},
Expand Down Expand Up @@ -146,6 +264,69 @@ var etcdMemberListCmd = &cobra.Command{
},
}

// etcdStatusCmd implements `etcd status`: it fetches the etcd member status via
// the API and prints one table row per reply, with database sizes rendered in
// human-readable form and the in-use share as a percentage.
var etcdStatusCmd = &cobra.Command{
	Use:   "status",
	Short: "Get the status of etcd cluster member",
	Long:  `Returns the status of etcd member on the node, use multiple nodes to get status of all members.`,
	RunE: func(cmd *cobra.Command, args []string) error {
		return WithClient(func(ctx context.Context, c *client.Client) error {
			resp, err := c.EtcdStatus(ctx)
			if err != nil {
				if resp == nil {
					return fmt.Errorf("error getting status: %w", err)
				}

				// A response is available despite the error: warn and show it.
				cli.Warning("%s", err)
			}

			var (
				hostname  string
				rowFormat = "%s\t%s\t%s (%.2f%%)\t%s\t%d\t%d\t%d\t%v\t%s\n"
				columns   = "MEMBER\tDB SIZE\tIN USE\tLEADER\tRAFT INDEX\tRAFT TERM\tRAFT APPLIED INDEX\tLEARNER\tERRORS"
			)

			out := tabwriter.NewWriter(os.Stdout, 0, 0, 3, ' ', 0)

			for idx, msg := range resp.Messages {
				if md := msg.Metadata; md != nil && md.Hostname != "" {
					hostname = md.Hostname
				}

				// The layout (with or without NODE column) is fixed on the first reply.
				if idx == 0 {
					if hostname != "" {
						columns = "NODE\t" + columns
						rowFormat = "%s\t" + rowFormat
					}

					fmt.Fprintln(out, columns)
				}

				status := msg.GetMemberStatus()

				// Guard against division by zero when the database size is not reported.
				usedPercent := 0.0
				if total := status.GetDbSize(); total > 0 {
					usedPercent = float64(status.GetDbSizeInUse()) / float64(total) * 100.0
				}

				row := make([]any, 0, 11)
				if hostname != "" {
					row = append(row, hostname)
				}

				row = append(row,
					etcdresource.FormatMemberID(status.GetMemberId()),
					humanize.Bytes(uint64(status.GetDbSize())),
					humanize.Bytes(uint64(status.GetDbSizeInUse())),
					usedPercent,
					etcdresource.FormatMemberID(status.GetLeader()),
					status.GetRaftIndex(),
					status.GetRaftTerm(),
					status.GetRaftAppliedIndex(),
					status.GetIsLearner(),
					strings.Join(status.GetErrors(), ", "),
				)

				fmt.Fprintf(out, rowFormat, row...)
			}

			return out.Flush()
		})
	},
}

var etcdSnapshotCmd = &cobra.Command{
Use: "snapshot <path>",
Short: "Stream snapshot of the etcd node to the path.",
Expand Down Expand Up @@ -228,6 +409,21 @@ var etcdSnapshotCmd = &cobra.Command{
}

func init() {
etcdCmd.AddCommand(etcdLeaveCmd, etcdForfeitLeadershipCmd, etcdMemberListCmd, etcdMemberRemoveCmd, etcdSnapshotCmd)
etcdAlarmCmd.AddCommand(
etcdAlarmListCmd,
etcdAlarmDisarmCmd,
)

etcdCmd.AddCommand(
etcdAlarmCmd,
etcdDefragCmd,
etcdForfeitLeadershipCmd,
etcdLeaveCmd,
etcdMemberListCmd,
etcdMemberRemoveCmd,
etcdSnapshotCmd,
etcdStatusCmd,
)

addCommand(etcdCmd)
}
13 changes: 13 additions & 0 deletions hack/release.toml
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,19 @@ preface = """\
Talos is built with Go 1.19.4.
"""

[notes.etcd]
title = "etcd Maintenance"
description="""\
Talos adds new APIs to make it easier to perform etcd maintenance operations.
These APIs are available via new `talosctl etcd` sub-commands:
* `talosctl etcd alarm list|disarm`
* `talosctl etcd defrag`
* `talosctl etcd status`
See also [etcd maintenance guide](https://talos.dev/v1.4/advanced/etcd-maintenance/).
"""

[make_deps]

Expand Down
Loading

0 comments on commit 96629d5

Please sign in to comment.