Skip to content

Commit

Permalink
fix: don't allow bootstrap if etcd data directory is not empty
Browse files Browse the repository at this point in the history
If the etcd data directory is not empty, it means that `etcd` is already
running (or has run before), so removing the data directory might lead to
data loss. Under normal circumstances, `etcd` is not started until the node
either joins an existing etcd cluster or gets bootstrapped.

Most importantly, with this change it's no longer possible to nuke an etcd
member by issuing `talosctl bootstrap` on an already running Talos node.

Additionally, nag the user about running bootstrap if the node is waiting to join.

Signed-off-by: Andrey Smirnov <andrey.smirnov@talos-systems.com>
  • Loading branch information
smira committed Aug 24, 2021
1 parent e24b93b commit 0a6048f
Show file tree
Hide file tree
Showing 3 changed files with 16 additions and 38 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -324,6 +324,10 @@ func (s *Server) Bootstrap(ctx context.Context, in *machine.BootstrapRequest) (r
return nil, status.Error(codes.FailedPrecondition, "time is not in sync yet")
}

if entries, _ := os.ReadDir(constants.EtcdDataPath); len(entries) > 0 { //nolint:errcheck
return nil, status.Error(codes.AlreadyExists, "etcd data directory is not empty")
}

go func() {
if err := s.Controller.Run(context.Background(), runtime.SequenceBootstrap, in); err != nil {
log.Println("bootstrap failed:", err)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1606,8 +1606,6 @@ func Install(seq runtime.Sequence, data interface{}) (runtime.TaskExecutionFunc,
}

// BootstrapEtcd represents the task for bootstrapping etcd.
//
//nolint:gocyclo
func BootstrapEtcd(seq runtime.Sequence, data interface{}) (runtime.TaskExecutionFunc, string) {
return func(ctx context.Context, logger *log.Logger, r runtime.Runtime) (err error) {
req, ok := data.(*machineapi.BootstrapRequest)
Expand All @@ -1629,41 +1627,8 @@ func BootstrapEtcd(seq runtime.Sequence, data interface{}) (runtime.TaskExecutio
}
}

if err = func() error {
// Since etcd has already attempted to start, we must delete the data. If
// we don't, then an initial cluster state of "new" will fail.
var dir *os.File

dir, err = os.Open(constants.EtcdDataPath)
if err != nil {
if os.IsNotExist(err) {
return nil
}

return err
}

//nolint:errcheck
defer dir.Close()

var files []os.FileInfo

files, err = dir.Readdir(0)
if err != nil {
return err
}

for _, file := range files {
fullPath := filepath.Join(constants.EtcdDataPath, file.Name())

if err = os.RemoveAll(fullPath); err != nil {
return fmt.Errorf("failed to remove %q: %w", file.Name(), err)
}
}

return nil
}(); err != nil {
return err
if entries, _ := os.ReadDir(constants.EtcdDataPath); len(entries) > 0 { //nolint:errcheck
return fmt.Errorf("etcd data directory is not empty")
}

svc := &services.Etcd{
Expand Down
11 changes: 10 additions & 1 deletion internal/app/machined/pkg/system/services/etcd.go
Original file line number Diff line number Diff line change
Expand Up @@ -351,7 +351,10 @@ func addMember(ctx context.Context, r runtime.Runtime, addrs []string, name stri
}

func buildInitialCluster(ctx context.Context, r runtime.Runtime, name, ip string) (initial string, learnerMemberID uint64, err error) {
var id uint64
var (
id uint64
lastNag time.Time
)

err = retry.Constant(10*time.Minute,
retry.WithUnits(3*time.Second),
Expand All @@ -363,6 +366,12 @@ func buildInitialCluster(ctx context.Context, r runtime.Runtime, name, ip string
resp *clientv3.MemberListResponse
)

if time.Since(lastNag) > 30*time.Second {
log.Printf("etcd is waiting to join the cluster, if this node is the first node in the cluster, please run `talosctl bootstrap`")

lastNag = time.Now()
}

attemptCtx, attemptCtxCancel := context.WithTimeout(ctx, 30*time.Second)
defer attemptCtxCancel()

Expand Down

0 comments on commit 0a6048f

Please sign in to comment.