Skip to content

Commit

Permalink
fix: store and execute desired action on emergency action
Browse files Browse the repository at this point in the history
Fixes #7854

Talos runs an emergency handler if the sequence experiences an
unrecoverable failure. The emergency handler was unconditionally
executing the "reboot" action if no other action was received (which is
only received if the sequence completes successfully), so a Shutdown
request might result in Reboot behavior on an error during the shutdown
phase.

This is not a pretty fix, but it's hard to deliver the intent from one
part of the code to another right now, so instead use a global variable
which stores default emergency intention, and gets overridden early in
the Shutdown sequence.

Signed-off-by: Andrey Smirnov <andrey.smirnov@siderolabs.com>
  • Loading branch information
smira committed Dec 4, 2023
1 parent 515ae2a commit 474fa04
Show file tree
Hide file tree
Showing 5 changed files with 44 additions and 2 deletions.
3 changes: 2 additions & 1 deletion internal/app/machined/main.go
Expand Up @@ -27,6 +27,7 @@ import (
"github.com/siderolabs/talos/internal/app/apid"
"github.com/siderolabs/talos/internal/app/dashboard"
"github.com/siderolabs/talos/internal/app/machined/pkg/runtime"
"github.com/siderolabs/talos/internal/app/machined/pkg/runtime/emergency"
v1alpha1runtime "github.com/siderolabs/talos/internal/app/machined/pkg/runtime/v1alpha1"
"github.com/siderolabs/talos/internal/app/machined/pkg/system"
"github.com/siderolabs/talos/internal/app/machined/pkg/system/services"
Expand Down Expand Up @@ -95,7 +96,7 @@ func syncNonVolatileStorageBuffers() {

//nolint:gocyclo
func handle(ctx context.Context, err error) {
rebootCmd := unix.LINUX_REBOOT_CMD_RESTART
rebootCmd := int(emergency.RebootCmd.Load())

var rebootErr runtime.RebootError

Expand Down
Expand Up @@ -534,6 +534,13 @@ func (ctrl *PlatformConfigController) runWithRestarts(ctx context.Context, logge
return
}

// skip restarting if context is already done
select {
case <-ctx.Done():
return
default:
}

interval := backoff.NextBackOff()

logger.Error("restarting platform network config", zap.Duration("interval", interval), zap.Error(err))
Expand Down
19 changes: 19 additions & 0 deletions internal/app/machined/pkg/runtime/emergency/emergency.go
@@ -0,0 +1,19 @@
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at http://mozilla.org/MPL/2.0/.

// Package emergency provides values used by machined's emergency (panic/unrecoverable error) handling.
package emergency

import (
"sync/atomic"

"golang.org/x/sys/unix"
)

// defaultRebootCmd is the value RebootCmd holds until some other code
// stores a different command.
const defaultRebootCmd int64 = unix.LINUX_REBOOT_CMD_RESTART

// RebootCmd is the reboot command to execute after an unrecoverable error.
//
// The value is kept in an atomic so that it can be stored and loaded
// without additional locking. It is initialized to
// unix.LINUX_REBOOT_CMD_RESTART in init.
var RebootCmd atomic.Int64

func init() {
	// atomic.Int64 has no literal form, so the default is set at package init.
	RebootCmd.Store(defaultRebootCmd)
}
Expand Up @@ -375,7 +375,10 @@ func (*Sequencer) Reset(r runtime.Runtime, in runtime.ResetOptions) []runtime.Ph

// Shutdown is the shutdown sequence.
func (*Sequencer) Shutdown(r runtime.Runtime, in *machineapi.ShutdownRequest) []runtime.Phase {
phases := PhaseList{}.AppendWhen(
phases := PhaseList{}.Append(
"storeShudown",
StoreShutdownEmergency,
).AppendWhen(
!in.GetForce() && !r.Config().Machine().Kubelet().SkipNodeRegistration(),
"drain",
CordonAndDrainNode,
Expand Down
Expand Up @@ -47,6 +47,7 @@ import (
installer "github.com/siderolabs/talos/cmd/installer/pkg/install"
"github.com/siderolabs/talos/internal/app/machined/pkg/runtime"
"github.com/siderolabs/talos/internal/app/machined/pkg/runtime/disk"
"github.com/siderolabs/talos/internal/app/machined/pkg/runtime/emergency"
"github.com/siderolabs/talos/internal/app/machined/pkg/runtime/v1alpha1/bootloader/grub"
"github.com/siderolabs/talos/internal/app/machined/pkg/runtime/v1alpha1/platform"
"github.com/siderolabs/talos/internal/app/machined/pkg/system"
Expand Down Expand Up @@ -2255,6 +2256,17 @@ func FlushMeta(runtime.Sequence, any) (runtime.TaskExecutionFunc, string) {
}, "flushMeta"
}

// StoreShutdownEmergency returns a task which records power-off as the
// emergency action, together with the task name "storeShutdownEmergency".
//
// Both arguments are ignored.
func StoreShutdownEmergency(runtime.Sequence, any) (runtime.TaskExecutionFunc, string) {
	// The task uses none of its arguments; it only overrides the stored
	// emergency command and always succeeds.
	task := func(context.Context, *log.Logger, runtime.Runtime) error {
		// For the shutdown sequence the intent is power-off: the emergency
		// handler in machined/main.go picks this value up if the Shutdown
		// sequence fails.
		emergency.RebootCmd.Store(unix.LINUX_REBOOT_CMD_POWER_OFF)

		return nil
	}

	return task, "storeShutdownEmergency"
}

func pauseOnFailure(callback func(runtime.Sequence, any) (runtime.TaskExecutionFunc, string),
timeout time.Duration,
) func(seq runtime.Sequence, data any) (runtime.TaskExecutionFunc, string) {
Expand Down

0 comments on commit 474fa04

Please sign in to comment.