Skip to content

Commit

Permalink
fix: service restart (including extension services)
Browse files Browse the repository at this point in the history
Fixes #6707

There was a race condition between different parts of the service code:
`Stop` waits for the event which is published before the service is
removed from the `running[id]` map, so if one does `Stop` followed by
`Start` (this is what `services restart` API does), by the time it goes
to `Start` it might be still in the `running[id]` map, so `Start` does
nothing.

Overall this code should be rewritten and simplified, but for now move
out sending these "terminal" events out so that by the time the event is
published, the service is stopped and removed from the `running[id]`
map.

Signed-off-by: Andrey Smirnov <andrey.smirnov@talos-systems.com>
  • Loading branch information
smira committed Jan 18, 2023
1 parent 680fd5e commit 18122ae
Show file tree
Hide file tree
Showing 3 changed files with 82 additions and 85 deletions.
34 changes: 16 additions & 18 deletions internal/app/machined/pkg/system/service_runner.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ package system

import (
"context"
"errors"
"fmt"
"log"
"sync"
Expand Down Expand Up @@ -182,12 +183,17 @@ func (svcrunner *ServiceRunner) waitFor(ctx context.Context, condition condition
}
}

// Start initializes the service and runs it
// ErrSkip is returned by Run when service is skipped.
var ErrSkip = errors.New("service skipped")

// Run initializes the service and runs it.
//
// Run returns an error when a service stops.
//
// Start should be run in a goroutine.
// Run should be run in a goroutine.
//
//nolint:gocyclo
func (svcrunner *ServiceRunner) Start() {
func (svcrunner *ServiceRunner) Run() error {
defer func() {
// reset context for the next run
svcrunner.ctxMu.Lock()
Expand Down Expand Up @@ -215,27 +221,21 @@ func (svcrunner *ServiceRunner) Start() {

if condition != nil {
if err := svcrunner.waitFor(ctx, condition); err != nil {
svcrunner.UpdateState(ctx, events.StateFailed, "Condition failed: %v", err)

return
return fmt.Errorf("condition failed: %w", err)
}
}

svcrunner.UpdateState(ctx, events.StatePreparing, "Running pre state")

if err := svcrunner.service.PreFunc(ctx, svcrunner.runtime); err != nil {
svcrunner.UpdateState(ctx, events.StateFailed, "Failed to run pre stage: %v", err)

return
return fmt.Errorf("failed to run pre stage: %w", err)
}

svcrunner.UpdateState(ctx, events.StatePreparing, "Creating service runner")

runnr, err := svcrunner.service.Runner(svcrunner.runtime)
if err != nil {
svcrunner.UpdateState(ctx, events.StateFailed, "Failed to create runner: %v", err)

return
return fmt.Errorf("failed to create runner: %w", err)
}

defer func() {
Expand All @@ -248,16 +248,14 @@ func (svcrunner *ServiceRunner) Start() {
}()

if runnr == nil {
svcrunner.UpdateState(ctx, events.StateSkipped, "Service skipped")

return
return ErrSkip
}

if err := svcrunner.run(ctx, runnr); err != nil {
svcrunner.UpdateState(ctx, events.StateFailed, "Failed running service: %v", err)
} else {
svcrunner.UpdateState(ctx, events.StateFinished, "Service finished successfully")
return fmt.Errorf("failed running service: %w", err)
}

return nil
}

//nolint:gocyclo
Expand Down

0 comments on commit 18122ae

Please sign in to comment.