diff --git a/docs/snap-confine.rst b/docs/snap-confine.rst index 6495e33..89be31d 100644 --- a/docs/snap-confine.rst +++ b/docs/snap-confine.rst @@ -85,6 +85,13 @@ quirks: the execution environment. This allows various snaps, while running in devmode, to access the LXD socket. LP: #1613845 +Sharing of the mount namespace +------------------------------ + +As of version 1.0.41 all the applications from the same snap will share the +same mount namespace. Applications from different snaps continue to use +separate mount namespaces. + ENVIRONMENT =========== @@ -126,6 +133,41 @@ FILES Description of the seccomp profile. +`/run/snapd/ns/`: + + Directory used to keep shared mount namespaces. + + `snap-confine` internally converts this directory to a private bind mount. + Semantically the behavior is identical to the following mount commands: + + mount --bind /run/snapd/ns /run/snapd/ns + mount --make-private /run/snapd/ns + +`/run/snapd/ns/.lock`: + + A `flock(2)`-based lock file acquired to create and convert + `/run/snapd/ns/` to a private bind mount. + +`/run/snapd/ns/$SNAP_NAME.lock`: + + A `flock(2)`-based lock file acquired to create or join the mount namespace + represented as `/run/snapd/ns/$SNAP_NAME.mnt`. + +`/run/snapd/ns/$SNAP_NAME.mnt`: + + This file can be either: + + - An empty file that may be seen before the mount namespace is preserved or + when the mount namespace is unmounted. + - A file belonging to the `nsfs` file system, representing a fully + populated mount namespace of a given snap. The file is bind mounted from + `/proc/self/ns/mnt` from the first process in any snap. + +`/proc/self/mountinfo`: + + This file is read to decide if `/run/snapd/ns/` needs to be created and + converted to a private bind mount, as described above. + Note that the apparmor profile is external to `snap-confine` and is loaded directly into the kernel. The actual apparmor profile is managed by `snapd`. 
diff --git a/src/Makefile.am b/src/Makefile.am index a032c4a..021eaeb 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -25,7 +25,9 @@ snap_confine_SOURCES = \ quirks.c \ quirks.h \ mountinfo.c \ - mountinfo.h + mountinfo.h \ + ns-support.c \ + ns-support.h snap_confine_CFLAGS = -Wall -Werror $(AM_CFLAGS) snap_confine_LDFLAGS = $(AM_LDFLAGS) @@ -66,7 +68,8 @@ snap_confine_unit_tests_SOURCES = \ cleanup-funcs-test.c \ mount-support-test.c \ verify-executable-name-test.c \ - mountinfo-test.c + mountinfo-test.c \ + ns-support-test.c snap_confine_unit_tests_CFLAGS = $(snap_confine_CFLAGS) $(GLIB_CFLAGS) snap_confine_unit_tests_LDADD = $(snap_confine_LDADD) $(GLIB_LIBS) snap_confine_unit_tests_LDFLAGS = $(snap_confine_LDFLAGS) diff --git a/src/cleanup-funcs.c b/src/cleanup-funcs.c index c34ed53..9b5cead 100644 --- a/src/cleanup-funcs.c +++ b/src/cleanup-funcs.c @@ -18,6 +18,7 @@ #include "cleanup-funcs.h" #include +#include void sc_cleanup_string(char **ptr) { @@ -49,3 +50,8 @@ void sc_cleanup_closedir(DIR ** ptr) closedir(*ptr); } } + +void sc_cleanup_close(int *ptr) +{ + close(*ptr); +} diff --git a/src/cleanup-funcs.h b/src/cleanup-funcs.h index d8977ba..ca0d37c 100644 --- a/src/cleanup-funcs.h +++ b/src/cleanup-funcs.h @@ -72,4 +72,12 @@ void sc_cleanup_seccomp_release(scmp_filter_ctx * ptr); **/ void sc_cleanup_closedir(DIR ** ptr); +/** + * Close an open file descriptor with close(2) + * + * This function is designed to be used with + * __attribute__((cleanup(sc_cleanup_close))). + **/ +void sc_cleanup_close(int *ptr); + #endif diff --git a/src/mountinfo.c b/src/mountinfo.c index 82d042e..9a49688 100644 --- a/src/mountinfo.c +++ b/src/mountinfo.c @@ -235,6 +235,8 @@ static struct mountinfo_entry *parse_mountinfo_entry(const char *line) if ((entry->mount_opts = parse_next_string_field()) == NULL) goto fail; entry->optional_fields = &entry->line_buf[0] + total_used++; + // NOTE: This ensures that optional_fields is never NULL. 
If this changes, + // must adjust all callers of parse_mountinfo_entry() accordingly. strcpy(entry->optional_fields, ""); for (;;) { char *opt_field = parse_next_string_field(); diff --git a/src/ns-support-test.c b/src/ns-support-test.c new file mode 100644 index 0000000..1c320aa --- /dev/null +++ b/src/ns-support-test.c @@ -0,0 +1,342 @@ +/* + * Copyright (C) 2016 Canonical Ltd + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 3 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + * + */ + +#include "ns-support.h" +#include "ns-support.c" + +#include "cleanup-funcs.h" + +#include +#include // for NSFS_MAGIC +#include + +#include +#include + +// Set alternate namespace directory +static void sc_set_ns_dir(const char *dir) +{ + sc_ns_dir = dir; +} + +// Shell-out to "rm -rf -- $dir" as long as $dir is in /tmp. +static void rm_rf_tmp(const char *dir) +{ + // Sanity check, don't remove anything that's not in the temporary + // directory. This is here to prevent unintended data loss. 
+ if (!g_str_has_prefix(dir, "/tmp/")) + die("refusing to remove: %s", dir); + const gchar *working_directory = NULL; + gchar **argv = NULL; + gchar **envp = NULL; + GSpawnFlags flags = G_SPAWN_SEARCH_PATH; + GSpawnChildSetupFunc child_setup = NULL; + gpointer user_data = NULL; + gchar **standard_output = NULL; + gchar **standard_error = NULL; + gint exit_status = 0; + GError *error = NULL; + + argv = calloc(5, sizeof *argv); + if (argv == NULL) + die("cannot allocate command argument array"); + argv[0] = g_strdup("rm"); + if (argv[0] == NULL) + die("cannot allocate memory"); + argv[1] = g_strdup("-rf"); + if (argv[1] == NULL) + die("cannot allocate memory"); + argv[2] = g_strdup("--"); + if (argv[2] == NULL) + die("cannot allocate memory"); + argv[3] = g_strdup(dir); + if (argv[3] == NULL) + die("cannot allocate memory"); + argv[4] = NULL; + g_assert_true(g_spawn_sync + (working_directory, argv, envp, flags, child_setup, + user_data, standard_output, standard_error, &exit_status, + &error)); + g_assert_true(g_spawn_check_exit_status(exit_status, NULL)); + if (error != NULL) { + g_test_message("cannot remove temporary directory: %s\n", + error->message); + g_error_free(error); + } + g_free(argv[0]); + g_free(argv[1]); + g_free(argv[2]); + g_free(argv[3]); + g_free(argv); +} + +// Check that rm_rf_tmp doesn't remove things outside of /tmp +static void test_rm_rf_tmp() +{ + if (access("/nonexistent", F_OK) == 0) { + g_test_message + ("/nonexistent exists but this test doesn't want it to"); + g_test_fail(); + return; + } + if (g_test_subprocess()) { + rm_rf_tmp("/nonexistent"); + return; + } + g_test_trap_subprocess(NULL, 0, 0); + g_test_trap_assert_failed(); +} + +// Use temporary directory for namespace groups. +// +// The directory is automatically reset to the real value at the end of the +// test. +static const char *sc_test_use_fake_ns_dir() +{ + char *ns_dir = NULL; + if (g_test_subprocess()) { + // Check if the environment variable is set. 
If so then someone is already + // managing the temporary directory and we should not create a new one. + ns_dir = getenv("SNAP_CONFINE_NS_DIR"); + g_assert_nonnull(ns_dir); + } else { + ns_dir = g_dir_make_tmp(NULL, NULL); + g_assert_nonnull(ns_dir); + g_test_queue_free(ns_dir); + g_assert_cmpint(setenv("SNAP_CONFINE_NS_DIR", ns_dir, 0), ==, + 0); + g_test_queue_destroy((GDestroyNotify) unsetenv, + "SNAP_CONFINE_NS_DIR"); + g_test_queue_destroy((GDestroyNotify) rm_rf_tmp, ns_dir); + } + g_test_queue_destroy((GDestroyNotify) sc_set_ns_dir, SC_NS_DIR); + sc_set_ns_dir(ns_dir); + return ns_dir; +} + +// Check that allocating a namespace group sets up internal data structures to +// safe values. +static void test_sc_alloc_ns_group() +{ + struct sc_ns_group *group = NULL; + group = sc_alloc_ns_group(); + g_test_queue_free(group); + g_assert_nonnull(group); + g_assert_cmpint(group->dir_fd, ==, -1); + g_assert_cmpint(group->lock_fd, ==, -1); + g_assert_cmpint(group->event_fd, ==, -1); + g_assert_cmpint(group->child, ==, 0); + g_assert_cmpint(group->should_populate, ==, false); + g_assert_null(group->name); +} + +// Initialize a namespace group. +// +// The group is automatically destroyed at the end of the test. 
+static struct sc_ns_group *sc_test_open_ns_group(const char *group_name) +{ + // Initialize a namespace group + struct sc_ns_group *group = NULL; + if (group_name == NULL) { + group_name = "test-group"; + } + group = sc_open_ns_group(group_name); + g_test_queue_destroy((GDestroyNotify) sc_close_ns_group, group); + // Check if the returned group data looks okay + g_assert_nonnull(group); + g_assert_cmpint(group->dir_fd, !=, -1); + g_assert_cmpint(group->lock_fd, !=, -1); + g_assert_cmpint(group->event_fd, ==, -1); + g_assert_cmpint(group->child, ==, 0); + g_assert_cmpint(group->should_populate, ==, false); + g_assert_cmpstr(group->name, ==, group_name); + return group; +} + +// Check that initializing a namespace group creates the appropriate +// filesystem structure and obtains open file descriptors for the lock. +static void test_sc_open_ns_group() +{ + const char *ns_dir = sc_test_use_fake_ns_dir(); + struct sc_ns_group *group = sc_test_open_ns_group(NULL); + // Check that the group directory exists + g_assert_true(g_file_test + (ns_dir, G_FILE_TEST_EXISTS | G_FILE_TEST_IS_DIR)); + // Check that the lock file exists + char *lock_file __attribute__ ((cleanup(sc_cleanup_string))) = NULL; + lock_file = + g_strdup_printf("%s/%s%s", ns_dir, group->name, SC_NS_LOCK_FILE); + g_assert_true(g_file_test + (lock_file, G_FILE_TEST_EXISTS | G_FILE_TEST_IS_REGULAR)); +} + +static void test_sc_lock_ns_mutex_precondition() +{ + sc_test_use_fake_ns_dir(); + if (g_test_subprocess()) { + struct sc_ns_group *group = sc_alloc_ns_group(); + g_test_queue_free(group); + // Try to lock the mutex, this should abort because we never opened the + // lock file and don't have a valid file descriptor. 
+ sc_lock_ns_mutex(group); + return; + } + g_test_trap_subprocess(NULL, 0, 0); + g_test_trap_assert_failed(); +} + +static void test_sc_unlock_ns_mutex_precondition() +{ + sc_test_use_fake_ns_dir(); + if (g_test_subprocess()) { + struct sc_ns_group *group = sc_alloc_ns_group(); + g_test_queue_free(group); + // Try to unlock the mutex, this should abort because we never opened the + // lock file and don't have a valid file descriptor. + sc_unlock_ns_mutex(group); + return; + } + g_test_trap_subprocess(NULL, 0, 0); + g_test_trap_assert_failed(); +} + +// Check that locking a namespace actually flock's the mutex with LOCK_EX +static void test_sc_lock_unlock_ns_mutex() +{ + const char *ns_dir = sc_test_use_fake_ns_dir(); + struct sc_ns_group *group = sc_test_open_ns_group(NULL); + // Lock the namespace group mutex + sc_lock_ns_mutex(group); + // Construct the name of the lock file + char *lock_file __attribute__ ((cleanup(sc_cleanup_string))) = NULL; + lock_file = + g_strdup_printf("%s/%s%s", ns_dir, group->name, SC_NS_LOCK_FILE); + // Open the lock file again to obtain a separate file descriptor. + // According to flock(2) locks are associated with an open file table entry + // so this descriptor will be separate and can compete for the same lock. + int lock_fd __attribute__ ((cleanup(sc_cleanup_close))) = -1; + lock_fd = open(lock_file, O_RDWR | O_CLOEXEC | O_NOFOLLOW); + g_assert_cmpint(lock_fd, !=, -1); + // The non-blocking lock operation should fail with EWOULDBLOCK as the lock + // file is locked by sc_lock_ns_mutex() already. + int err = flock(lock_fd, LOCK_EX | LOCK_NB); + int saved_errno = errno; + g_assert_cmpint(err, ==, -1); + g_assert_cmpint(saved_errno, ==, EWOULDBLOCK); + // Unlock the namespace group mutex + sc_unlock_ns_mutex(group); + // Re-attempt the locking operation. This time it should succeed. 
+ err = flock(lock_fd, LOCK_EX | LOCK_NB); + g_assert_cmpint(err, ==, 0); +} + +static void unmount_dir(void *dir) +{ + umount(dir); +} + +static void test_sc_is_ns_group_dir_private() +{ + if (geteuid() != 0) { + g_test_skip("this test needs to run as root"); + return; + } + const char *ns_dir = sc_test_use_fake_ns_dir(); + g_test_queue_destroy(unmount_dir, (char *)ns_dir); + + if (g_test_subprocess()) { + // The temporary directory should not be private initially + g_assert_false(sc_is_ns_group_dir_private()); + + /// do what "mount --bind /foo /foo; mount --make-private /foo" does. + int err; + err = mount(ns_dir, ns_dir, NULL, MS_BIND, NULL); + g_assert_cmpint(err, ==, 0); + err = mount(NULL, ns_dir, NULL, MS_PRIVATE, NULL); + g_assert_cmpint(err, ==, 0); + + // The temporary directory should now be private + g_assert_true(sc_is_ns_group_dir_private()); + return; + } + g_test_trap_subprocess(NULL, 0, G_TEST_SUBPROCESS_INHERIT_STDERR); + g_test_trap_assert_passed(); +} + +static void test_sc_initialize_ns_groups() +{ + if (geteuid() != 0) { + g_test_skip("this test needs to run as root"); + return; + } + // NOTE: this is g_test_subprocess aware! + const char *ns_dir = sc_test_use_fake_ns_dir(); + g_test_queue_destroy(unmount_dir, (char *)ns_dir); + if (g_test_subprocess()) { + // Initialize namespace groups using a fake directory. + sc_initialize_ns_groups(); + + // Check that the fake directory is now a private mount. + g_assert_true(sc_is_ns_group_dir_private()); + + // Check that the lock file did not leak unclosed. + + // Construct the name of the lock file + char *lock_file __attribute__ ((cleanup(sc_cleanup_string))) = + NULL; + lock_file = + g_strdup_printf("%s/%s", sc_ns_dir, SC_NS_LOCK_FILE); + // Attempt to open and lock the lock file. 
+ int lock_fd __attribute__ ((cleanup(sc_cleanup_close))) = -1; + lock_fd = open(lock_file, O_RDWR | O_CLOEXEC | O_NOFOLLOW); + g_assert_cmpint(lock_fd, !=, -1); + // The non-blocking lock operation should not fail + int err = flock(lock_fd, LOCK_EX | LOCK_NB); + g_assert_cmpint(err, ==, 0); + return; + } + g_test_trap_subprocess(NULL, 0, G_TEST_SUBPROCESS_INHERIT_STDERR); + g_test_trap_assert_passed(); +} + +// Sanity check, ensure that the namespace filesystem identifier is what we +// expect, aka NSFS_MAGIC. +static void test_nsfs_fs_id() +{ + struct statfs buf; + int err = statfs("/proc/self/ns/mnt", &buf); + g_assert_cmpint(err, ==, 0); + g_assert_cmpint(buf.f_type, ==, NSFS_MAGIC); +} + +static void __attribute__ ((constructor)) init() +{ + g_test_add_func("/internal/rm_rf_tmp", test_rm_rf_tmp); + g_test_add_func("/ns/sc_alloc_ns_group", test_sc_alloc_ns_group); + g_test_add_func("/ns/sc_init_ns_group", test_sc_open_ns_group); + g_test_add_func("/ns/sc_lock_unlock_ns_mutex", + test_sc_lock_unlock_ns_mutex); + g_test_add_func("/ns/sc_lock_ns_mutex/precondition", + test_sc_lock_ns_mutex_precondition); + g_test_add_func("/ns/sc_unlock_ns_mutex/precondition", + test_sc_unlock_ns_mutex_precondition); + g_test_add_func("/ns/nsfs_fs_id", test_nsfs_fs_id); + g_test_add_func("/system/ns/sc_is_ns_group_dir_private", + test_sc_is_ns_group_dir_private); + g_test_add_func("/system/ns/sc_initialize_ns_groups", + test_sc_initialize_ns_groups); +} diff --git a/src/ns-support.c b/src/ns-support.c new file mode 100644 index 0000000..50b77f2 --- /dev/null +++ b/src/ns-support.c @@ -0,0 +1,422 @@ +/* + * Copyright (C) 2016 Canonical Ltd + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 3 as + * published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + * + */ + +#include "ns-support.h" + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef HAVE_APPARMOR +#include +#endif // ifdef HAVE_APPARMOR + +#include "utils.h" +#include "user-support.h" +#include "mountinfo.h" +#include "cleanup-funcs.h" + +/** + * Directory where snap-confine keeps namespace files. + **/ +#define SC_NS_DIR "/run/snapd/ns" + +/** + * Effective value of SC_NS_DIR. + * + * We use 'const char *' so we can update sc_ns_dir in the testsuite + **/ +static const char *sc_ns_dir = SC_NS_DIR; + +/** + * Name of the lock file associated with SC_NS_DIR. + * and a given group identifier (typically SNAP_NAME). + **/ +#define SC_NS_LOCK_FILE ".lock" + +/** + * Name of the preserved mount namespace associated with SC_NS_DIR + * and a given group identifier (typically SNAP_NAME). + **/ +#define SC_NS_MNT_FILE ".mnt" + +/** + * Read /proc/self/mountinfo and check if /run/snapd/ns is a private bind mount. 
+ * + * We do this because /run/snapd/ns cannot be shared with any other peers as per: + * https://www.kernel.org/doc/Documentation/filesystems/sharedsubtree.txt + **/ +static bool sc_is_ns_group_dir_private() +{ + struct mountinfo *info + __attribute__ ((cleanup(cleanup_mountinfo))) = NULL; + info = parse_mountinfo(NULL); + if (info == NULL) { + die("cannot parse /proc/self/mountinfo"); + } + struct mountinfo_entry *entry = first_mountinfo_entry(info); + while (entry != NULL) { + const char *mount_dir = mountinfo_entry_mount_dir(entry); + const char *optional_fields = + mountinfo_entry_optional_fields(entry); + if (strcmp(mount_dir, sc_ns_dir) == 0 + && strcmp(optional_fields, "") == 0) { + // If /run/snapd/ns has no optional fields, we know it is mounted + // private and there is nothing else to do. + return true; + } + entry = next_mountinfo_entry(entry); + } + return false; +} + +void sc_initialize_ns_groups() +{ + debug("creating namespace group directory %s", sc_ns_dir); + mkpath(sc_ns_dir); + debug("opening namespace group directory %s", sc_ns_dir); + int dir_fd __attribute__ ((cleanup(sc_cleanup_close))) = -1; + dir_fd = open(sc_ns_dir, O_DIRECTORY | O_PATH | O_CLOEXEC | O_NOFOLLOW); + if (dir_fd < 0) { + die("cannot open namespace group directory"); + } + debug("opening lock file for group directory"); + int lock_fd __attribute__ ((cleanup(sc_cleanup_close))) = -1; + lock_fd = openat(dir_fd, + SC_NS_LOCK_FILE, + O_CREAT | O_RDWR | O_CLOEXEC | O_NOFOLLOW, 0600); + if (lock_fd < 0) { + die("cannot open lock file for namespace group directory"); + } + debug("locking the namespace group directory"); + if (flock(lock_fd, LOCK_EX) < 0) { + die("cannot acquire exclusive lock for namespace group directory"); + } + if (!sc_is_ns_group_dir_private()) { + debug + ("bind mounting the namespace group directory over itself"); + if (mount(sc_ns_dir, sc_ns_dir, NULL, MS_BIND | MS_REC, NULL) < + 0) { + die("cannot bind mount namespace group directory over itself"); + } + 
debug + ("making the namespace group directory mount point private"); + if (mount(NULL, sc_ns_dir, NULL, MS_PRIVATE, NULL) < 0) { + die("cannot make the namespace group directory mount point private"); + } + } else { + debug + ("namespace group directory does not require intialization"); + } + debug("unlocking the namespace group directory"); + if (flock(lock_fd, LOCK_UN) < 0) { + die("cannot release lock for namespace control directory"); + } +} + +struct sc_ns_group { + // Name of the namespace group ($SNAP_NAME). + char *name; + // Descriptor to the namespace group control directory. This descriptor is + // opened with O_PATH|O_DIRECTORY so it's only used for openat() calls. + int dir_fd; + // Descriptor to a namespace-specific lock file (i.e. $SNAP_NAME.lock). + int lock_fd; + // Descriptor to an eventfd that is used to notify the child that it can + // now complete its job and exit. + int event_fd; + // Identifier of the child process that is used during the one-time (per + // group) initialization and capture process. + pid_t child; + // Flag set when this process created a fresh namespace and should populate it. + bool should_populate; +}; + +static struct sc_ns_group *sc_alloc_ns_group() +{ + struct sc_ns_group *group = calloc(1, sizeof *group); + if (group == NULL) { + die("cannot allocate memory for namespace group"); + } + group->dir_fd = -1; + group->lock_fd = -1; + group->event_fd = -1; + // Redundant with calloc but some functions check for the non-zero value so + // I'd like to keep this explicit in the code. 
+ group->child = 0; + return group; +} + +struct sc_ns_group *sc_open_ns_group(const char *group_name) +{ + struct sc_ns_group *group = sc_alloc_ns_group(); + debug("opening namespace group directory %s", sc_ns_dir); + group->dir_fd = + open(sc_ns_dir, O_DIRECTORY | O_PATH | O_CLOEXEC | O_NOFOLLOW); + if (group->dir_fd < 0) { + die("cannot open directory for namespace group %s", group_name); + } + char lock_fname[PATH_MAX]; + must_snprintf(lock_fname, sizeof lock_fname, "%s%s", group_name, + SC_NS_LOCK_FILE); + debug("opening lock file for namespace group %s", group_name); + group->lock_fd = + openat(group->dir_fd, lock_fname, + O_CREAT | O_RDWR | O_CLOEXEC | O_NOFOLLOW, 0600); + if (group->lock_fd < 0) { + die("cannot open lock file for namespace group %s", group_name); + } + group->name = strdup(group_name); + if (group->name == NULL) { + die("cannot duplicate namespace group name %s", group_name); + } + return group; +} + +void sc_close_ns_group(struct sc_ns_group *group) +{ + debug("releasing resources associated wih namespace group %s", + group->name); + close(group->dir_fd); + close(group->lock_fd); + close(group->event_fd); + free(group->name); + free(group); +} + +void sc_lock_ns_mutex(struct sc_ns_group *group) +{ + if (group->lock_fd < 0) { + die("precondition failed: we don't have an open file descriptor for the mutex file"); + } + debug("acquiring exclusive lock for namespace group %s", group->name); + if (flock(group->lock_fd, LOCK_EX) < 0) { + die("cannot acquire exclusive lock for namespace group %s", + group->name); + } + debug("acquired exclusive lock for namespace group %s", group->name); +} + +void sc_unlock_ns_mutex(struct sc_ns_group *group) +{ + if (group->lock_fd < 0) { + die("precondition failed: we don't have an open file descriptor for the mutex file"); + } + debug("releasing lock for namespace group %s", group->name); + if (flock(group->lock_fd, LOCK_UN) < 0) { + die("cannot release lock for namespace group %s", group->name); + } + 
debug("released lock for namespace group %s", group->name); +} + +void sc_create_or_join_ns_group(struct sc_ns_group *group) +{ + // Open the mount namespace file. + char mnt_fname[PATH_MAX]; + must_snprintf(mnt_fname, sizeof mnt_fname, "%s%s", group->name, + SC_NS_MNT_FILE); + int mnt_fd __attribute__ ((cleanup(sc_cleanup_close))) = -1; + // NOTE: There is no O_EXCL here because the file can be around but + // doesn't have to be a mounted namespace. + // + // If the mounted namespace is discarded with + // sc_discard_preserved_ns_group() it will revert to a regular file. If + // snap-confine is killed for whatever reason after the file is created but + // before the file is bind-mounted it will also be a regular file. + mnt_fd = + openat(group->dir_fd, mnt_fname, + O_CREAT | O_RDONLY | O_CLOEXEC | O_NOFOLLOW, 0600); + if (mnt_fd < 0) { + die("cannot open mount namespace file for namespace group %s", + group->name); + } + // Check if we got an nsfs-based file or a regular file. This can be + // reliably tested because nsfs has an unique filesystem type NSFS_MAGIC. + // We can just ensure that this is the case thanks to fstatfs. + struct statfs buf; + if (fstatfs(mnt_fd, &buf) < 0) { + die("cannot perform fstatfs() on an mount namespace file descriptor"); + } + if (buf.f_type == NSFS_MAGIC) { + debug + ("attempting to re-associate the mount namespace with the namespace group %s", + group->name); + if (setns(mnt_fd, CLONE_NEWNS) < 0) { + die("cannot re-associate the mount namespace with namespace group %s", group->name); + } + debug + ("successfully re-associated the mount namespace with the namespace group %s", + group->name); + return; + } + debug("initializing new namespace group %s", group->name); + // Create a new namespace and ask the caller to populate it. 
+ // For rationale of forking see this: + // https://lists.linuxfoundation.org/pipermail/containers/2013-August/033386.html + // + // The eventfd created here is used to synchronize the child and the parent + // processes. It effectively tells the child to perform the capture + // operation. + group->event_fd = eventfd(0, EFD_CLOEXEC); + if (group->event_fd < 0) { + die("cannot create eventfd for mount namespace capture"); + } + debug("forking support process for mount namespace capture"); + // Glibc defines pid as a signed 32bit integer. There's no standard way to + // print pid's portably so this is the best we can do. + pid_t pid = fork(); + debug("forked support process has pid %d", (int)pid); + if (pid < 0) { + die("cannot fork support process for mount namespace capture"); + } + if (pid == 0) { + // This is the child process which will capture the mount namespace. + // + // It will do so by bind-mounting the SC_NS_MNT_FILE after the parent + // process calls unshare() and finishes setting up the namespace + // completely. +#ifdef HAVE_APPARMOR + // Change the hat to a sub-profile that has limited permissions + // necessary to accomplish the capture of the mount namespace. + debug + ("changing apparmor hat of the support process for mount namespace capture"); + if (aa_change_hat("mount-namespace-capture-helper", 0) < 0) { + die("cannot change apparmor hat of the support process for mount namespace capture"); + } +#endif + // Configure the child to die as soon as the parent dies. In an odd + // case where the parent is killed then we don't want to complete our + // task or wait for anything. 
+ if (prctl(PR_SET_PDEATHSIG, SIGINT, 0, 0, 0) < 0) { + die("cannot set parent process death notification signal to SIGINT"); + } + if (fchdir(group->dir_fd) < 0) { + die("cannot move process for mount namespace capture to namespace group directory"); + } + debug + ("waiting for a eventfd data from the parent process to continue"); + eventfd_t value = 0; + if (eventfd_read(group->event_fd, &value) < 0) { + die("cannot read expected data from eventfd"); + } + pid_t parent = getppid(); + debug + ("capturing mount namespace of process %d in namespace group %s", + (int)parent, group->name); + char src[PATH_MAX]; + char dst[PATH_MAX]; + must_snprintf(src, sizeof src, "/proc/%d/ns/mnt", (int)parent); + must_snprintf(dst, sizeof dst, "%s%s", group->name, + SC_NS_MNT_FILE); + if (mount(src, dst, NULL, MS_BIND, NULL) < 0) { + die("cannot bind-mount the mount namespace file %s -> %s", src, dst); + } + debug + ("successfully captured mount namespace in namespace group %s", + group->name); + exit(0); + } else { + group->child = pid; + // Unshare the mount namespace and set a flag instructing the caller that + // the namespace is pristine and needs to be populated now. 
+ debug("unsharing the mount namespace"); + if (unshare(CLONE_NEWNS) < 0) { + die("cannot unshare the mount namespace"); + } + group->should_populate = true; + } +} + +bool sc_should_populate_ns_group(struct sc_ns_group *group) +{ + return group->should_populate; +} + +void sc_preserve_populated_ns_group(struct sc_ns_group *group) +{ + if (group->child == 0) { + die("precondition failed: we don't have a support process for mount namespace capture"); + } + if (group->event_fd < 0) { + die("precondition failed: we don't have an eventfd for mount namespace capture"); + } + debug + ("asking support process for mount namespace capture (pid: %d) to perform the capture", + group->child); + if (eventfd_write(group->event_fd, 1) < 0) { + die("cannot write eventfd"); + } + debug + ("waiting for the support process for mount namespace capture to exit"); + int status = 0; + errno = 0; + if (waitpid(group->child, &status, 0) < 0) { + die("cannot wait for the support process for mount namespace capture"); + } + if (!WIFEXITED(status) || WEXITSTATUS(status) != 0) { + die("support process for mount namespace capture exited abnormally"); + } + debug("support process for mount namespace capture exited normally"); + group->child = 0; +} + +void sc_discard_preserved_ns_group(struct sc_ns_group *group) +{ + // Remember the current working directory + int old_dir_fd __attribute__ ((cleanup(sc_cleanup_close))) = -1; + old_dir_fd = open(".", O_PATH | O_DIRECTORY | O_CLOEXEC); + if (old_dir_fd < 0) { + die("cannot open current directory"); + } + // Move to the mount namespace directory (/run/snapd/ns) + if (fchdir(group->dir_fd) < 0) { + die("cannot move to namespace group directory"); + } + // Unmount ${group_name}.mnt which holds the preserved namespace + char mnt_fname[PATH_MAX]; + must_snprintf(mnt_fname, sizeof mnt_fname, "%s%s", group->name, + SC_NS_MNT_FILE); + debug("unmounting preserved mount namespace file %s", mnt_fname); + if (umount2(mnt_fname, UMOUNT_NOFOLLOW) < 0) { + // 
EINVAL is returned when there's nothing to unmount (no bind-mount). + // Instead of checking for this explicitly (which is always racy) we + // just unmount and check the return code. + if (errno != EINVAL) { + die("cannot unmount preserved mount namespace file %s", + mnt_fname); + } + } + // Get back to the original directory + if (fchdir(old_dir_fd) < 0) { + die("cannot move back to original directory"); + } +} diff --git a/src/ns-support.h b/src/ns-support.h new file mode 100644 index 0000000..9eb7e01 --- /dev/null +++ b/src/ns-support.h @@ -0,0 +1,142 @@ +/* + * Copyright (C) 2016 Canonical Ltd + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 3 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + * + */ + +#ifndef SNAP_NAMESPACE_SUPPORT +#define SNAP_NAMESPACE_SUPPORT + +#include + +/** + * Initialize namespace sharing. + * + * This function must be called once in each process that wishes to create or + * join a namespace group. + * + * It is responsible for bind mounting the control directory over itself and + * making it private (unsharing it with all the other peers) so that it can be + * used for storing preserved namespaces as bind-mounted files from the nsfs + * filesystem (namespace filesystem). + * + * This function acquires a flock(2)-based lock to ensure that no other instance + * of snap-confine attempts to do this concurrently. If a process dies for any + * reason then the lock is released and other instances of snap-confine can + * complete the initialization. 
+ * + * This function inspects /proc/self/mountinfo to determine if the directory + * where namespaces are kept (/run/snapd/ns) is correctly prepared as described + * above. + * + * For more details see namespaces(7). + **/ +void sc_initialize_ns_groups(); + +/** + * Data required to manage namespaces amongst a group of processes. + */ +struct sc_ns_group; + +/** + * Open a namespace group. + * + * This will open and keep file descriptors for /run/snapd/ns/ as well as for + * /run/snapd/ns/${group_name}.lock. The lock file is created if necessary but + * is not locked until sc_lock_ns_mutex() is called. + */ +struct sc_ns_group *sc_open_ns_group(const char *group_name); + +/** + * Close namespace group. + * + * This will close all of the open file descriptors and release allocated memory. + */ +void sc_close_ns_group(struct sc_ns_group *group); + +/** + * Acquire exclusive lock to the namespace group. + * + * This will attempt to acquire an flock-based exclusive lock on the file + * descriptor associated with /run/snapd/ns/${group_name}.lock. If the process + * is killed while the lock is held the lock is automatically released by the + * kernel. + * + * The following methods should be called only while holding the lock: + * - sc_create_or_join_ns_group() + * - sc_should_populate_ns_group() + * - sc_preserve_populated_ns_group() + * - sc_discard_preserved_ns_group() + **/ +void sc_lock_ns_mutex(struct sc_ns_group *group); + +/** + * Release lock to the namespace group. + * + * This will attempt to release a flock-based lock on the file descriptor + * associated with /run/snapd/ns/${group_name}.lock. + **/ +void sc_unlock_ns_mutex(struct sc_ns_group *group); + +/** + * Join the mount namespace associated with this group if one exists. + * + * Technically the function opens /run/snapd/ns/${group_name}.mnt and tries to + * use setns() with the obtained file descriptor. 
If the call succeeds then the + * function returns and a subsequent call to sc_should_populate_ns_group() will + * return false. + * + * If the call fails then an eventfd is constructed and a support process is + * forked. The child process waits until data is written to the eventfd (this + * can be done by calling sc_preserve_populated_ns_group()). In the meantime + * the parent process unshares the mount namespace and sets a flag so that + * sc_should_populate_ns_group() returns true. + * + * This function returns nothing; use sc_should_populate_ns_group() to check. + **/ +void sc_create_or_join_ns_group(struct sc_ns_group *group); + +/** + * Check if the namespace needs to be populated. + * + * If the return value is true then at this stage the namespace is already + * unshared. The caller should perform any mount operations that are desired + * and then proceed to call sc_preserve_populated_ns_group(). + **/ +bool sc_should_populate_ns_group(struct sc_ns_group *group); + +/** + * Preserve prepared namespace group. + * + * This function signals the child support process for namespace capture to + * perform the capture and shut down. It must be called after the call to + * sc_create_or_join_ns_group() and only when sc_should_populate_ns_group() + * returns true. + * + * Technically this function writes to an eventfd that causes the child process + * to wake up, bind mount /proc/$ppid/ns/mnt to /run/snapd/ns/${group_name}.mnt + * and then exit. The parent process (the caller) then collects the child + * process and returns. + **/ +void sc_preserve_populated_ns_group(struct sc_ns_group *group); + +/** + * Discard the preserved namespace group. + * + * This function unmounts the bind-mounted files representing the kernel mount + * namespace. 
+ **/ +void sc_discard_preserved_ns_group(struct sc_ns_group *group); + +#endif diff --git a/src/unit-tests.c b/src/unit-tests.c index 49cf8af..783809d 100644 --- a/src/unit-tests.c +++ b/src/unit-tests.c @@ -30,6 +30,7 @@ static void simple_test_case(void) int sc_run_unit_tests(int *argc, char ***argv) { g_test_init(argc, argv, NULL); + g_test_set_nonfatal_assertions(); g_test_add_func("/Simple Test Case", simple_test_case); return g_test_run(); }