Skip to content

Commit

Permalink
cmd/snap: don't wait for system key when stopping
Browse files Browse the repository at this point in the history
When the system is shutting down waiting for system key is futile. While
we should not have allowed for the core / snapd snaps to be unmounted
before all the services have correctly stopped, fixing that is a longer
journey.

Meanwhile, we can just check if system is shutting down and if so,
ignore any system key mismatches that have occurred.

Fixes: https://bugs.launchpad.net/snapd/+bug/1871652
Signed-off-by: Zygmunt Krynicki <zygmunt.krynicki@canonical.com>
  • Loading branch information
zyga authored and mvo5 committed Apr 9, 2020
1 parent cb67dd2 commit 0ce6726
Show file tree
Hide file tree
Showing 5 changed files with 165 additions and 0 deletions.
34 changes: 34 additions & 0 deletions cmd/snap/cmd_run.go
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,26 @@ and environment.
}, nil)
}

// isStopping returns true if the system is shutting down.
func isStopping() (bool, error) {
// Make sure, just in case, that systemd doesn't localize the output string.
env, err := osutil.OSEnvironment()
if err != nil {
return false, err
}
env["LC_MESSAGES"] = "C"
// Check if systemd is stopping (shutting down or rebooting).
cmd := exec.Command("systemctl", "is-system-running")
cmd.Env = env.ForExec()
stdout, err := cmd.Output()
// systemctl is-system-running returns non-zero for outcomes other than "running"
// As such, ignore any ExitError and just process the stdout buffer.
if _, ok := err.(*exec.ExitError); ok {
return string(stdout) == "stopping\n", nil
}
return false, err
}

func maybeWaitForSecurityProfileRegeneration(cli *client.Client) error {
// check if the security profiles key has changed, if so, we need
// to wait for snapd to re-generate all profiles
Expand All @@ -125,6 +145,18 @@ func maybeWaitForSecurityProfileRegeneration(cli *client.Client) error {
logger.Debugf("SystemKeyMismatch returned an error: %v", err)
}

// We have a mismatch but maybe it is only because systemd is shutting down
// and core or snapd were already unmounted and we failed to re-execute.
// For context see: https://bugs.launchpad.net/snapd/+bug/1871652
stopping, err := isStopping()
if err != nil {
logger.Debugf("cannot check if system is stopping: %s", err)
}
if stopping {
logger.Debugf("ignoring system key mismatch during system shutdown/reboot")
return nil
}

// We have a mismatch, try to connect to snapd, once we can
// connect we just continue because that usually means that
// a new snapd is ready and has generated profiles.
Expand All @@ -145,6 +177,8 @@ func maybeWaitForSecurityProfileRegeneration(cli *client.Client) error {
}
}

logger.Debugf("system key mismatch detected, waiting for snapd to start responding...")

for i := 0; i < timeout; i++ {
if _, err := cli.SysInfo(); err == nil {
return nil
Expand Down
60 changes: 60 additions & 0 deletions cmd/snap/cmd_run_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1182,3 +1182,63 @@ func (s *RunSuite) TestSnapRunRestoreSecurityContextFail(c *check.C) {
c.Check(verifyCalls, check.Equals, 2)
c.Check(restoreCalls, check.Equals, 1)
}

// systemctl is-system-running returns "running" in normal situations.
func (s *RunSuite) TestIsStoppingRunning(c *check.C) {
systemctl := testutil.MockCommand(c, "systemctl", `
case "$1" in
is-system-running)
echo "running"
exit 0
;;
esac
`)
defer systemctl.Restore()
stop, err := snaprun.IsStopping()
c.Check(err, check.IsNil)
c.Check(stop, check.Equals, false)
c.Check(systemctl.Calls(), check.DeepEquals, [][]string{
{"systemctl", "is-system-running"},
})
}

// systemctl is-system-running returns "stopping" when the system is
// shutting down or rebooting. At the same time it returns a non-zero
// exit status.
func (s *RunSuite) TestIsStoppingStopping(c *check.C) {
systemctl := testutil.MockCommand(c, "systemctl", `
case "$1" in
is-system-running)
echo "stopping"
exit 1
;;
esac
`)
defer systemctl.Restore()
stop, err := snaprun.IsStopping()
c.Check(err, check.IsNil)
c.Check(stop, check.Equals, true)
c.Check(systemctl.Calls(), check.DeepEquals, [][]string{
{"systemctl", "is-system-running"},
})
}

// systemctl is-system-running can often return "degraded"
// Let's make sure that is not confusing us.
func (s *RunSuite) TestIsStoppingDegraded(c *check.C) {
systemctl := testutil.MockCommand(c, "systemctl", `
case "$1" in
is-system-running)
echo "degraded"
exit 1
;;
esac
`)
defer systemctl.Restore()
stop, err := snaprun.IsStopping()
c.Check(err, check.IsNil)
c.Check(stop, check.Equals, false)
c.Check(systemctl.Calls(), check.DeepEquals, [][]string{
{"systemctl", "is-system-running"},
})
}
2 changes: 2 additions & 0 deletions cmd/snap/export_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,8 @@ var (
SortTimingsTasks = sortTimingsTasks

PrintInstallHint = printInstallHint

IsStopping = isStopping
)

func HiddenCmd(descr string, completeHidden bool) *cmdInfo {
Expand Down
6 changes: 6 additions & 0 deletions tests/regression/lp-1871652/systemctl
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
#!/bin/sh
if [ "${1:-}" = is-system-running ]; then
echo "stopping"
exit 1
fi
exec /bin/systemctl "$@"
63 changes: 63 additions & 0 deletions tests/regression/lp-1871652/task.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
summary: system key mismatch is ignored on reboot/shutdown
# Run on a system matching the guest container.
systems: [ubuntu-18.04-64]
prepare: |
snap install lxd
lxd init --auto
lxc launch ubuntu:18.04 bionic
# Install snapd inside the container and then install the core snap so that
# we get re-execution logic to applies as snapd in the store is more recent
# than snapd in the archive.
lxc exec bionic -- apt-get update
lxc exec bionic -- apt-get install snapd
lxc exec bionic -- systemctl start snapd.socket snapd.service
# XXX: This fails on: error: cannot communicate with server: Post http://localhost/v2/snaps/core: dial unix /run/snapd.socket: connect: no such file or directory
# But why?
retry-tool -n 30 lxc exec bionic -- snap install core
# Install lxd inside the container and activate it (it is socket activated)
# by running a lxc command. When lxd service shuts down systemd runs a
# "snap run ... " command related to it, which will is where the problem
# originates (we will observe a system key mismatch).
lxc exec bionic -- snap install lxd
lxc exec bionic -- lxc info >/dev/null
# Overwrite the snap binary inside the container with the one from this
# system. Given that snapd on the system has just been built from source it
# will have the custom 1337 version string.
# NOTE: This step is only necessary while snapd in the store is older than the fix.
lxc file push /usr/bin/snap bionic/usr/bin/snap
lxc exec bionic -- snap version | MATCH 1337
# Hide real snap binary, ensuring we run the patched one we copied above.
# I considered unmounting the core snap instead but this upsets parts of
# the stack that really want to read information about the current revision
# of the core snap.
lxc exec bionic -- mount --bind /usr/bin/snap /snap/core/current/usr/bin/snap
# Stop and disable snapd.{service,socket} so that successful socket activation is impossible.
lxc exec bionic -- systemctl stop snapd.service
lxc exec bionic -- systemctl stop snapd.socket
# To be *extra sure* that system key mismatch will happen, rewrite the build-id to something else.
lxc exec bionic -- sed -i -E -e 's/"build-id":"[0-9a-f]+"/"build-id":"potato"/' /var/lib/snapd/system-key
lxc exec bionic -- cat /var/lib/snapd/system-key | MATCH potato
# Put our special systemctl and make it executable
lxc file push systemctl bionic/usr/local/bin/systemctl
lxc exec bionic -- chmod +x /usr/local/bin/systemctl
execute: |
# Try to stop LXD in a way that matches what happens when we are shutting
# down. Use the special variable controlling retry count, so that we don't
# have to wait too long.
/usr/bin/time -p -o stop.time lxc exec bionic -- env SNAPD_DEBUG_SYSTEM_KEY_RETRY=2 /usr/bin/snap run --command=stop lxd.daemon || true
# Normally, with the bug present, this hangs for at least 10 seconds and
# then fail, with the fix it should be under one second but we allow for up
# to five. The comparison is performed in milliseconds.
test "$(LC_NUMERIC=C awk '/real/ { print($2 * 1000); exit(0);}' < stop.time)" -lt 5000
restore: |
lxc exec bionic -- chmod -x /usr/local/bin/systemctl
lxc stop --force bionic
snap remove --purge lxd
lxd-tool undo-lxd-mount-changes

0 comments on commit 0ce6726

Please sign in to comment.