From bb1fd12404b63203d369b25d52ea64a92a0d0d30 Mon Sep 17 00:00:00 2001 From: Akihiro Suda Date: Fri, 23 Jun 2023 22:02:27 +0900 Subject: [PATCH] [Carry 362] support detach-netns Planned to be used for: - accelerating (and deflaking) `nerdctl pull` and `nerdctl build` - supporting `nerdctl run --net=host` This commit is based on PR 362 (originally authored by Fahed Dorgaa), but almost rewritten from scratch. Co-authored-by: fahed dorgaa Signed-off-by: Akihiro Suda --- .github/workflows/main.yaml | 16 ++++ README.md | 1 + cmd/rootlesskit/main.go | 7 ++ docs/internal.md | 1 + docs/network.md | 6 ++ go.mod | 1 + go.sum | 10 +++ hack/benchmark-iperf3-net.sh | 18 +++- hack/benchmark-iperf3-port.sh | 6 +- pkg/child/child.go | 111 ++++++++++++++++++------- pkg/network/lxcusernic/lxcusernic.go | 11 ++- pkg/network/network.go | 7 +- pkg/network/parentutils/parentutils.go | 19 +++-- pkg/network/slirp4netns/slirp4netns.go | 17 +++- pkg/network/vpnkit/vpnkit.go | 57 ++++++++----- pkg/parent/parent.go | 21 +++-- pkg/port/builtin/child/child.go | 15 +++- pkg/port/port.go | 3 +- pkg/port/slirp4netns/slirp4netns.go | 2 +- pkg/port/testsuite/testsuite.go | 2 +- 20 files changed, 247 insertions(+), 84 deletions(-) diff --git a/.github/workflows/main.yaml b/.github/workflows/main.yaml index f3a9d12f..f93e323e 100644 --- a/.github/workflows/main.yaml +++ b/.github/workflows/main.yaml @@ -48,12 +48,20 @@ jobs: run: | docker run --rm --security-opt seccomp=unconfined --security-opt apparmor=unconfined --device /dev/net/tun \ rootlesskit:test-integration ./benchmark-iperf3-net.sh slirp4netns 1500 --slirp4netns-sandbox=auto --slirp4netns-seccomp=auto + - name: "Benchmark: Network (MTU=1500, network driver=slirp4netns with sandbox and seccomp) with detach-netns" + run: | + docker run --rm --security-opt seccomp=unconfined --security-opt apparmor=unconfined --device /dev/net/tun \ + rootlesskit:test-integration ./benchmark-iperf3-net.sh slirp4netns 1500 --slirp4netns-sandbox=auto 
--slirp4netns-seccomp=auto --detach-netns # NOTE: MTU greater than 16424 is known not to work for VPNKit. # Also, MTU greather than 4K might not be effective for VPNKit: https://twitter.com/mugofsoup/status/1017665057738641408 - name: "Benchmark: Network (MTU=1500, network driver=vpnkit)" run: | docker run --rm --security-opt seccomp=unconfined --security-opt apparmor=unconfined --device /dev/net/tun \ rootlesskit:test-integration ./benchmark-iperf3-net.sh vpnkit 1500 + - name: "Benchmark: Network (MTU=1500, network driver=vpnkit) with detach-netns" + run: | + docker run --rm --security-opt seccomp=unconfined --security-opt apparmor=unconfined --device /dev/net/tun \ + rootlesskit:test-integration ./benchmark-iperf3-net.sh vpnkit 1500 --detach-netns - name: "Benchmark: Network (MTU=1500, network driver=lxc-user-nic)" run: | docker run --rm --privileged \ @@ -84,10 +92,18 @@ jobs: run: | docker run --rm --security-opt seccomp=unconfined --security-opt apparmor=unconfined --device /dev/net/tun \ rootlesskit:test-integration ./benchmark-iperf3-port.sh slirp4netns + - name: "Benchmark: TCP Ports (port driver=slirp4netns) with detach-netns" + run: | + docker run --rm --security-opt seccomp=unconfined --security-opt apparmor=unconfined --device /dev/net/tun \ + rootlesskit:test-integration ./benchmark-iperf3-port.sh slirp4netns --detach-netns - name: "Benchmark: TCP Ports (port driver=builtin)" run: | docker run --rm --security-opt seccomp=unconfined --security-opt apparmor=unconfined --device /dev/net/tun \ rootlesskit:test-integration ./benchmark-iperf3-port.sh builtin + - name: "Benchmark: TCP Ports (port driver=builtin) with detach-netns" + run: | + docker run --rm --security-opt seccomp=unconfined --security-opt apparmor=unconfined --device /dev/net/tun \ + rootlesskit:test-integration ./benchmark-iperf3-port.sh builtin --detach-netns # ===== Benchmark: UDP Ports ===== - name: "Benchmark: UDP Ports (port driver=builtin)" run: | diff --git a/README.md b/README.md 
index 9f29025a..9a048b35 100644 --- a/README.md +++ b/README.md @@ -234,6 +234,7 @@ The following files will be created in the state directory, which can be specifi * `lock`: lock file * `child_pid`: decimal PID text that can be used for `nsenter(1)`. * `api.sock`: REST API socket. See [`./docs/api.md`](./docs/api.md) and [`./docs/port.md`](./docs/port.md). +* `netns`: Detached NetNS. Created only with `--detach-netns`. Valid only in the child mount namespace. If `--state-dir` is not specified, RootlessKit creates a temporary state directory on `/tmp` and removes it on exit. diff --git a/cmd/rootlesskit/main.go b/cmd/rootlesskit/main.go index b2763341..1a5b2e7c 100644 --- a/cmd/rootlesskit/main.go +++ b/cmd/rootlesskit/main.go @@ -166,6 +166,10 @@ See https://rootlesscontaine.rs/getting-started/common/ . Name: "ipcns", Usage: "create an IPC namespace", }, CategoryProcess), + Categorize(&cli.BoolFlag{ + Name: "detach-netns", + Usage: "detach network namespaces ", + }, CategoryNetwork), Categorize(&cli.StringFlag{ Name: "propagation", Usage: "mount propagation [rprivate, rslave]", @@ -280,6 +284,7 @@ func createParentOpt(clicontext *cli.Context, pipeFDEnvKey, stateDirEnvKey, pare CreateCgroupNS: clicontext.Bool("cgroupns"), CreateUTSNS: clicontext.Bool("utsns"), CreateIPCNS: clicontext.Bool("ipcns"), + DetachNetNS: clicontext.Bool("detach-netns"), ParentEUIDEnvKey: parentEUIDEnvKey, ParentEGIDEnvKey: parentEGIDEnvKey, Propagation: clicontext.String("propagation"), @@ -492,11 +497,13 @@ func (w *logrusDebugWriter) Write(p []byte) (int, error) { func createChildOpt(clicontext *cli.Context, pipeFDEnvKey, stateDirEnvKey string, targetCmd []string) (child.Opt, error) { pidns := clicontext.Bool("pidns") + detachNetNS := clicontext.Bool("detach-netns") opt := child.Opt{ PipeFDEnvKey: pipeFDEnvKey, StateDirEnvKey: stateDirEnvKey, TargetCmd: targetCmd, MountProcfs: pidns, + DetachNetNS: detachNetNS, Propagation: clicontext.String("propagation"), EvacuateCgroup2: 
clicontext.String("evacuate-cgroup2") != "", } diff --git a/docs/internal.md b/docs/internal.md index d458b21c..b4b6b635 100644 --- a/docs/internal.md +++ b/docs/internal.md @@ -5,6 +5,7 @@ Common: - `lock` - `child_pid` - `api.sock` +- `netns` (detached netns) Network driver `slirp4netns`: - `.s4nn.sock` diff --git a/docs/network.md b/docs/network.md index 2d10ddbf..a041ba26 100644 --- a/docs/network.md +++ b/docs/network.md @@ -176,3 +176,9 @@ Currently, the MAC address is always set to a random address. The `--ipv6` flag (since v0.14.0, EXPERIMENTAL) enables IPv6 routing for slirp4netns network driver. This flag is unrelated to port forwarding. + +## Detaching network namespace +The `--detach-netns` flag (since v2.0.0) detaches network namespaces into `$ROOTLESSKIT_STATE_DIR/netns` +and executes the child command in the host's network namespace. + +The child command can enter `$ROOTLESSKIT_STATE_DIR/netns` by itself to create nested network namespaces. diff --git a/go.mod b/go.mod index 20ea3b26..b38fbed3 100644 --- a/go.mod +++ b/go.mod @@ -3,6 +3,7 @@ module github.com/rootless-containers/rootlesskit go 1.19 require ( + github.com/containernetworking/plugins v1.3.0 github.com/gofrs/flock v0.8.1 github.com/google/uuid v1.3.0 github.com/gorilla/mux v1.8.0 diff --git a/go.sum b/go.sum index 8ce9ae16..07919154 100644 --- a/go.sum +++ b/go.sum @@ -1,13 +1,19 @@ +github.com/containernetworking/cni v1.1.2 h1:wtRGZVv7olUHMOqouPpn3cXJWpJgM6+EUl31EQbXALQ= +github.com/containernetworking/plugins v1.3.0 h1:QVNXMT6XloyMUoO2wUOqWTC1hWFV62Q6mVDp5H1HnjM= +github.com/containernetworking/plugins v1.3.0/go.mod h1:Pc2wcedTQQCVuROOOaLBPPxrEXqqXBFt3cZ+/yVg6l0= github.com/cpuguy83/go-md2man/v2 v2.0.2 h1:p1EgwI/C7NhT0JmVkwCD2ZBK8j4aeHQX2pMHHBfMQ6w= github.com/cpuguy83/go-md2man/v2 v2.0.2/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1 
h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/go-logr/logr v1.2.4 h1:g01GSCwiDw2xSZfjJ2/T9M+S6pFdcNtFYsp+Y43HYDQ= +github.com/go-task/slim-sprig v0.0.0-20230315185526-52ccab3ef572 h1:tfuBGBXKqDEevZMzYi5KSi8KkcZtzBcTgAUUtapy0OI= github.com/gofrs/flock v0.8.1 h1:+gYjHKf32LDeiEEFhQaotPbLuUXjY5ZqxKgXy7n59aw= github.com/gofrs/flock v0.8.1/go.mod h1:F1TvTiK9OcQqauNUHlbJvyl9Qa1QvF/gOUDKA14jxHU= github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/go-cmp v0.5.9 h1:O2Tfq5qg4qc4AmwVlvv0oLiVAGB7enBSJ2x2DqQFi38= github.com/google/go-cmp v0.5.9/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= +github.com/google/pprof v0.0.0-20230323073829-e72429f035bd h1:r8yyd+DJDmsUhGrRBxH5Pj7KeFK5l+Y3FsgT8keqKtk= github.com/google/uuid v1.3.0 h1:t6JiXgmwXMjEs8VusXIJk2BXHsn+wx8BZdTaoZ5fu7I= github.com/google/uuid v1.3.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/gorilla/mux v1.8.0 h1:i40aqfkR1h2SlN9hojwV5ZA91wcXFOvkdNIeFDP5koI= @@ -21,6 +27,8 @@ github.com/moby/sys/mountinfo v0.6.2 h1:BzJjoreD5BMFNmD9Rus6gdd1pLuecOFPt8wC+Vyg github.com/moby/sys/mountinfo v0.6.2/go.mod h1:IJb6JQeOklcdMU9F5xQ8ZALD+CUr5VlGpwtX+VE0rpI= github.com/moby/vpnkit v0.5.0 h1:VcDpS9y+PmT9itf+mH5Qdh9GME7ungLMt9yjf9o4REY= github.com/moby/vpnkit v0.5.0/go.mod h1:KyjUrL9cb6ZSNNAUwZfqRjhwwgJ3BJN+kXh0t43WTUQ= +github.com/onsi/ginkgo/v2 v2.9.2 h1:BA2GMJOtfGAfagzYtrAlufIP0lq6QERkFmHLMLPwFSU= +github.com/onsi/gomega v1.27.6 h1:ENqfyGeS5AX/rlXDd/ETokDz93u0YufY1Pgxuy/PvWE= github.com/pierrec/lz4/v4 v4.1.14/go.mod h1:gZWDp/Ze/IJXGXf23ltt2EXimqmTUXEy0GFuRQyBid4= github.com/pierrec/lz4/v4 v4.1.17 h1:kV4Ip+/hUBC+8T6+2EgburRtkE9ef4nbY3f4dFhGjMc= github.com/pierrec/lz4/v4 v4.1.17/go.mod h1:gZWDp/Ze/IJXGXf23ltt2EXimqmTUXEy0GFuRQyBid4= @@ -64,9 +72,11 @@ golang.org/x/sys v0.9.0 h1:KS/R3tvhPqvJvwcKfnBHJwwthS11LRhmM5D59eEXa0s= golang.org/x/sys v0.9.0/go.mod 
h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= +golang.org/x/text v0.9.0 h1:2sjJmO8cDvYveuX97RDLsxlyUxLl+GHoLxBiRdHllBE= golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= golang.org/x/tools v0.1.0/go.mod h1:xkSsbof2nBLbhDlRMhhhyNLN/zl3eTqcnHD5viDpcZ0= +golang.org/x/tools v0.7.0 h1:W4OVu8VVOaIO0yzWMNdepAulS7YfoS3Zabrm8DOXXU4= golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= diff --git a/hack/benchmark-iperf3-net.sh b/hack/benchmark-iperf3-net.sh index 0f747396..3c8dd03f 100755 --- a/hack/benchmark-iperf3-net.sh +++ b/hack/benchmark-iperf3-net.sh @@ -2,26 +2,38 @@ source $(realpath $(dirname $0))/common.inc.sh function benchmark::iperf3::slirp4netns() { INFO "[benchmark:iperf3] slirp4netns ($@)" + statedir=$(mktemp -d) + if echo "$@" | grep -q -- --detach-netns; then + IPERF3C="nsenter -n${statedir}/netns $IPERF3C" + fi set -x - $ROOTLESSKIT --net=slirp4netns $@ -- $IPERF3C 10.0.2.2 + $ROOTLESSKIT --state-dir=$statedir --net=slirp4netns $@ -- $IPERF3C 10.0.2.2 set +x } function benchmark::iperf3::vpnkit() { INFO "[benchmark:iperf3] vpnkit ($@)" + statedir=$(mktemp -d) + if echo "$@" | grep -q -- --detach-netns; then + IPERF3C="nsenter -n${statedir}/netns $IPERF3C" + fi set -x - $ROOTLESSKIT --net=vpnkit $@ -- $IPERF3C 192.168.65.2 + $ROOTLESSKIT --state-dir=$statedir --net=vpnkit $@ -- $IPERF3C 192.168.65.2 set +x } function benchmark::iperf3::lxc-user-nic() { INFO 
"[benchmark:iperf3] lxc-user-nic ($@)" + statedir=$(mktemp -d) + if echo "$@" | grep -q -- --detach-netns; then + IPERF3C="nsenter -n${statedir}/netns $IPERF3C" + fi dev=lxcbr0 set -x # ignore "lxc-net is already running" error sudo /usr/lib/$(uname -m)-linux-gnu/lxc/lxc-net start || true ip=$(ip -4 -o addr show $dev | awk '{print $4}' | cut -d "/" -f 1) - $ROOTLESSKIT --net=lxc-user-nic $@ -- $IPERF3C $ip + $ROOTLESSKIT --state-dir=$statedir --net=lxc-user-nic $@ -- $IPERF3C $ip set +x } diff --git a/hack/benchmark-iperf3-port.sh b/hack/benchmark-iperf3-port.sh index a25f469f..e04c5f2b 100755 --- a/hack/benchmark-iperf3-port.sh +++ b/hack/benchmark-iperf3-port.sh @@ -3,7 +3,11 @@ source $(realpath $(dirname $0))/common.inc.sh function benchmark::iperf3::port() { statedir=$(mktemp -d) INFO "[benchmark:iperf3::port] $@" - $ROOTLESSKIT --state-dir=$statedir $@ iperf3 -s >/dev/null & + IPERF3="iperf3" + if echo "$@" | grep -q -- --detach-netns; then + IPERF3="nsenter -n${statedir}/netns $IPERF3" + fi + $ROOTLESSKIT --state-dir=$statedir $@ $IPERF3 -s >/dev/null & rkpid=$! 
# wait for socket to be available sleep 3 diff --git a/pkg/child/child.go b/pkg/child/child.go index f2937e93..57a071ca 100644 --- a/pkg/child/child.go +++ b/pkg/child/child.go @@ -7,14 +7,13 @@ import ( "os" "os/exec" "os/signal" + "path/filepath" "runtime" "strconv" "syscall" "time" - "github.com/sirupsen/logrus" - "golang.org/x/sys/unix" - + "github.com/containernetworking/plugins/pkg/ns" "github.com/rootless-containers/rootlesskit/pkg/common" "github.com/rootless-containers/rootlesskit/pkg/copyup" "github.com/rootless-containers/rootlesskit/pkg/messages" @@ -22,6 +21,8 @@ import ( "github.com/rootless-containers/rootlesskit/pkg/port" "github.com/rootless-containers/rootlesskit/pkg/sigproxy" sigproxysignal "github.com/rootless-containers/rootlesskit/pkg/sigproxy/signal" + "github.com/sirupsen/logrus" + "golang.org/x/sys/unix" ) var propagationStates = map[string]uintptr{ @@ -152,39 +153,60 @@ func setupCopyDir(driver copyup.ChildDriver, dirs []string) (bool, error) { return false, nil } -func setupNet(stateDir string, msg *messages.ParentInitNetworkDriverCompleted, etcWasCopied bool, driver network.ChildDriver) error { +func setupNet(stateDir string, msg *messages.ParentInitNetworkDriverCompleted, etcWasCopied bool, driver network.ChildDriver, detachedNetNSPath string) error { // HostNetwork if driver == nil { return nil } - if err := activateLoopback(); err != nil { - return err - } - dev, err := driver.ConfigureNetworkChild(msg) - if err != nil { - return err - } - if err := activateDev(dev, msg.IP, msg.Netmask, msg.Gateway, msg.MTU); err != nil { - return err - } - if etcWasCopied { - if err := writeResolvConf(msg.DNS); err != nil { + + if detachedNetNSPath == "" { + // non-detached mode + if err := activateLoopback(); err != nil { + return err + } + dev, err := driver.ConfigureNetworkChild(msg, detachedNetNSPath) + if err != nil { return err } - if err := writeEtcHosts(); err != nil { + if err := activateDev(dev, msg.IP, msg.Netmask, msg.Gateway, msg.MTU); 
err != nil { return err } + if etcWasCopied { + if err := writeResolvConf(msg.DNS); err != nil { + return err + } + if err := writeEtcHosts(); err != nil { + return err + } + } else { + logrus.Warn("Mounting /etc/resolv.conf without copying-up /etc. " + + "Note that /etc/resolv.conf in the namespace will be unmounted when it is recreated on the host. " + + "Unless /etc/resolv.conf is statically configured, copying-up /etc is highly recommended. " + + "Please refer to RootlessKit documentation for further information.") + if err := mountResolvConf(stateDir, msg.DNS); err != nil { + return err + } + if err := mountEtcHosts(stateDir); err != nil { + return err + } + } } else { - logrus.Warn("Mounting /etc/resolv.conf without copying-up /etc. " + - "Note that /etc/resolv.conf in the namespace will be unmounted when it is recreated on the host. " + - "Unless /etc/resolv.conf is statically configured, copying-up /etc is highly recommended. " + - "Please refer to RootlessKit documentation for further information.") - if err := mountResolvConf(stateDir, msg.DNS); err != nil { + // detached mode + if err := ns.WithNetNSPath(detachedNetNSPath, func(_ ns.NetNS) error { + return activateLoopback() + }); err != nil { + return err + } + dev, err := driver.ConfigureNetworkChild(msg, detachedNetNSPath) + if err != nil { return err } - if err := mountEtcHosts(stateDir); err != nil { + if err := ns.WithNetNSPath(detachedNetNSPath, func(_ ns.NetNS) error { + return activateDev(dev, msg.IP, msg.Netmask, msg.Gateway, msg.MTU) + }); err != nil { return err } + // TODO: write /etc/resolv.conf and /etc/hosts in a custom directory? 
} return nil } @@ -196,6 +218,7 @@ type Opt struct { NetworkDriver network.ChildDriver // nil for HostNetwork CopyUpDriver copyup.ChildDriver // cannot be nil if len(CopyUpDirs) != 0 CopyUpDirs []string + DetachNetNS bool PortDriver port.ChildDriver MountProcfs bool // needs to be set if (and only if) parent.Opt.CreatePIDNS is set Propagation string // mount propagation type @@ -322,6 +345,20 @@ func Child(opt Opt) error { } } + if opt.MountProcfs { + if err := mountProcfs(); err != nil { + return err + } + } + + var detachedNetNSPath string + if opt.DetachNetNS { + detachedNetNSPath = filepath.Join(stateDir, "netns") + if err = NewNetNsWithPathWithoutEnter(detachedNetNSPath); err != nil { + return fmt.Errorf("failed to create a detached netns on %q: %w", detachedNetNSPath, err) + } + } + msgChildInitUserNSCompleted := &messages.Message{ U: messages.U{ ChildInitUserNSCompleted: &messages.ChildInitUserNSCompleted{}, @@ -362,17 +399,14 @@ func Child(opt Opt) error { if err != nil { return err } - if err := mountSysfs(opt.NetworkDriver == nil, opt.EvacuateCgroup2); err != nil { - return err - } - if err := setupNet(stateDir, netMsg, etcWasCopied, opt.NetworkDriver); err != nil { - return err - } - if opt.MountProcfs { - if err := mountProcfs(); err != nil { + if detachedNetNSPath == "" { + if err := mountSysfs(opt.NetworkDriver == nil, opt.EvacuateCgroup2); err != nil { return err } } + if err := setupNet(stateDir, netMsg, etcWasCopied, opt.NetworkDriver, detachedNetNSPath); err != nil { + return err + } portQuitCh := make(chan struct{}) portErrCh := make(chan error) if opt.PortDriver != nil { @@ -381,7 +415,7 @@ func Child(opt Opt) error { portDriverOpaque = portMsg.PortDriverOpaque } go func() { - portErrCh <- opt.PortDriver.RunChildDriver(portDriverOpaque, portQuitCh) + portErrCh <- opt.PortDriver.RunChildDriver(portDriverOpaque, portQuitCh, detachedNetNSPath) }() } @@ -484,3 +518,16 @@ func (e *reaperErr) Error() string { } return fmt.Sprintf("exited with 
WAITSTATUS=0x%08x", e.ws) } + +func NewNetNsWithPathWithoutEnter(p string) error { + if err := os.WriteFile(p, nil, 0400); err != nil { + return err + } + // this is hard (not impossible though) to reimplement in Go: https://github.com/cloudflare/slirpnetstack/commit/d7766a8a77f0093d3cb7a94bd0ccbe3f67d411ba + cmd := exec.Command("unshare", "-n", "mount", "--bind", "/proc/self/ns/net", p) + out, err := cmd.CombinedOutput() + if err != nil { + return fmt.Errorf("failed to execute %v: %w (out=%q)", cmd.Args, err, string(out)) + } + return nil +} diff --git a/pkg/network/lxcusernic/lxcusernic.go b/pkg/network/lxcusernic/lxcusernic.go index 3923c232..de5b1f36 100644 --- a/pkg/network/lxcusernic/lxcusernic.go +++ b/pkg/network/lxcusernic/lxcusernic.go @@ -68,7 +68,10 @@ func (d *parentDriver) MTU() int { return d.mtu } -func (d *parentDriver) ConfigureNetwork(childPID int, stateDir string) (*messages.ParentInitNetworkDriverCompleted, func() error, error) { +func (d *parentDriver) ConfigureNetwork(childPID int, stateDir, detachedNetNSPath string) (*messages.ParentInitNetworkDriverCompleted, func() error, error) { + if detachedNetNSPath != "" { + return nil, nil, fmt.Errorf("network driver %q does not support detach-netns", DriverName) + } var cleanups []func() error dummyLXCPath := "/dev/null" dummyLXCName := "dummy" @@ -127,7 +130,11 @@ func exchangeDHCP(c *client4.Client, dev string) (*dhcpv4.DHCPv4, error) { return ack, nil } -func (d *childDriver) ConfigureNetworkChild(netmsg *messages.ParentInitNetworkDriverCompleted) (string, error) { +func (d *childDriver) ConfigureNetworkChild(netmsg *messages.ParentInitNetworkDriverCompleted, detachedNetNSPath string) (string, error) { + if detachedNetNSPath != "" { + return "", fmt.Errorf("network driver %q does not support detach-netns", DriverName) + } + dev := netmsg.Dev if dev == "" { return "", errors.New("could not determine the dev") diff --git a/pkg/network/network.go b/pkg/network/network.go index a767c03a..5594c49f 
100644 --- a/pkg/network/network.go +++ b/pkg/network/network.go @@ -13,12 +13,15 @@ type ParentDriver interface { // MTU returns MTU MTU() int // ConfigureNetwork sets up Slirp, updates msg, and returns destructor function. - ConfigureNetwork(childPID int, stateDir string) (netmsg *messages.ParentInitNetworkDriverCompleted, cleanup func() error, err error) + // detachedNetNSPath is set only for the detach-netns mode. + ConfigureNetwork(childPID int, stateDir, detachedNetNSPath string) (netmsg *messages.ParentInitNetworkDriverCompleted, cleanup func() error, err error) } // ChildDriver is called from the child namespace type ChildDriver interface { + // ConfigureNetworkChild is executed in the child's namespaces, excluding detached-netns. + // // netmsg MAY be modified. // devName is like "tap" or "eth0" - ConfigureNetworkChild(netmsg *messages.ParentInitNetworkDriverCompleted) (devName string, err error) + ConfigureNetworkChild(netmsg *messages.ParentInitNetworkDriverCompleted, detachedNetNSPath string) (devName string, err error) } diff --git a/pkg/network/parentutils/parentutils.go b/pkg/network/parentutils/parentutils.go index b83575c1..4ef7834a 100644 --- a/pkg/network/parentutils/parentutils.go +++ b/pkg/network/parentutils/parentutils.go @@ -2,17 +2,16 @@ package parentutils import ( "fmt" - "os" "strconv" "github.com/rootless-containers/rootlesskit/pkg/common" ) -func PrepareTap(pid int, tap string) error { +func PrepareTap(childPID int, childNetNsPath string, tap string) error { cmds := [][]string{ - nsenter(pid, []string{"ip", "tuntap", "add", "name", tap, "mode", "tap"}), - nsenter(pid, []string{"ip", "link", "set", tap, "up"}), + nsenter(childPID, childNetNsPath, []string{"ip", "tuntap", "add", "name", tap, "mode", "tap"}), + nsenter(childPID, childNetNsPath, []string{"ip", "link", "set", tap, "up"}), } if err := common.Execs(os.Stderr, os.Environ(), cmds); err != nil { return fmt.Errorf("executing %v: %w", cmds, err) @@ -20,6 +19,14 @@ func 
PrepareTap(pid int, tap string) error { return nil } -func nsenter(pid int, cmd []string) []string { - return append([]string{"nsenter", "-t", strconv.Itoa(pid), "-n", "-m", "-U", "--preserve-credentials"}, cmd...) +func nsenter(childPID int, childNetNsPath string, cmd []string) []string { + fullCmd := []string{"nsenter", "-t", strconv.Itoa(childPID)} + if childNetNsPath != "" { + fullCmd = append(fullCmd, "-n"+childNetNsPath) + } else { + fullCmd = append(fullCmd, "-n") + } + fullCmd = append(fullCmd, []string{"-m", "-U", "--preserve-credentials"}...) + fullCmd = append(fullCmd, cmd...) + return fullCmd } diff --git a/pkg/network/slirp4netns/slirp4netns.go b/pkg/network/slirp4netns/slirp4netns.go index d21b84cb..07607486 100644 --- a/pkg/network/slirp4netns/slirp4netns.go +++ b/pkg/network/slirp4netns/slirp4netns.go @@ -173,10 +173,10 @@ func (d *parentDriver) MTU() int { return d.mtu } -func (d *parentDriver) ConfigureNetwork(childPID int, stateDir string) (*messages.ParentInitNetworkDriverCompleted, func() error, error) { +func (d *parentDriver) ConfigureNetwork(childPID int, stateDir, netns string) (*messages.ParentInitNetworkDriverCompleted, func() error, error) { tap := d.ifname var cleanups []func() error - if err := parentutils.PrepareTap(childPID, tap); err != nil { + if err := parentutils.PrepareTap(childPID, netns, tap); err != nil { return nil, common.Seq(cleanups), fmt.Errorf("setting up tap %s: %w", tap, err) } readyR, readyW, err := os.Pipe() @@ -205,7 +205,16 @@ func (d *parentDriver) ConfigureNetwork(childPID int, stateDir string) (*message if d.enableIPv6 { opts = append(opts, "--enable-ipv6") } - cmd := exec.Command(d.binary, append(opts, []string{strconv.Itoa(childPID), tap}...)...) + if netns == "" { + opts = append(opts, strconv.Itoa(childPID)) + } else { + opts = append(opts, + fmt.Sprintf("--userns-path=/proc/%d/ns/user", childPID), + "--netns-type=path", + netns) + } + opts = append(opts, tap) + cmd := exec.Command(d.binary, opts...) 
// FIXME: Stdout doen't seem captured cmd.Stdout = d.logWriter cmd.Stderr = d.logWriter @@ -313,7 +322,7 @@ func NewChildDriver() network.ChildDriver { type childDriver struct { } -func (d *childDriver) ConfigureNetworkChild(netmsg *messages.ParentInitNetworkDriverCompleted) (string, error) { +func (d *childDriver) ConfigureNetworkChild(netmsg *messages.ParentInitNetworkDriverCompleted, detachedNetNSPath string) (string, error) { tap := netmsg.Dev if tap == "" { return "", errors.New("could not determine the preconfigured tap") diff --git a/pkg/network/vpnkit/vpnkit.go b/pkg/network/vpnkit/vpnkit.go index 3b5571c5..5446783d 100644 --- a/pkg/network/vpnkit/vpnkit.go +++ b/pkg/network/vpnkit/vpnkit.go @@ -14,6 +14,7 @@ import ( "syscall" "time" + "github.com/containernetworking/plugins/pkg/ns" "github.com/google/uuid" "github.com/moby/vpnkit/go/pkg/vmnet" @@ -84,7 +85,7 @@ func (d *parentDriver) MTU() int { return d.mtu } -func (d *parentDriver) ConfigureNetwork(childPID int, stateDir string) (*messages.ParentInitNetworkDriverCompleted, func() error, error) { +func (d *parentDriver) ConfigureNetwork(childPID int, stateDir, detachedNetNSPath string) (*messages.ParentInitNetworkDriverCompleted, func() error, error) { var cleanups []func() error vpnkitSocket := filepath.Join(stateDir, "vpnkit-ethernet.sock") vpnkitCtx, vpnkitCancel := context.WithCancel(context.Background()) @@ -171,7 +172,7 @@ func NewChildDriver() network.ChildDriver { type childDriver struct { } -func (d *childDriver) ConfigureNetworkChild(netmsg *messages.ParentInitNetworkDriverCompleted) (tap string, err error) { +func (d *childDriver) ConfigureNetworkChild(netmsg *messages.ParentInitNetworkDriverCompleted, detachedNetNSPath string) (tap string, err error) { tapName := netmsg.Dev if tapName == "" { return "", errors.New("no dev is set") @@ -188,30 +189,44 @@ func (d *childDriver) ConfigureNetworkChild(netmsg *messages.ParentInitNetworkDr if uuidStr == "" { return "", errors.New("no VPNKit UUID is 
set") } - return startVPNKitRoutines(context.TODO(), tapName, macStr, socket, uuidStr) + return startVPNKitRoutines(context.TODO(), tapName, macStr, socket, uuidStr, detachedNetNSPath) } -func startVPNKitRoutines(ctx context.Context, tapName, macStr, socket, uuidStr string) (string, error) { - cmds := [][]string{ - {"ip", "tuntap", "add", "name", tapName, "mode", "tap"}, - {"ip", "link", "set", tapName, "address", macStr}, - // IP stuff and MTU are configured in activateTap() in pkg/child/child.go - } - if err := common.Execs(os.Stderr, os.Environ(), cmds); err != nil { - return "", fmt.Errorf("executing %v: %w", cmds, err) +func startVPNKitRoutines(ctx context.Context, tapName, macStr, socket, uuidStr, detachedNetNSPath string) (string, error) { + var tap *water.Interface + fn := func(_ ns.NetNS) error { + cmds := [][]string{ + {"ip", "tuntap", "add", "name", tapName, "mode", "tap"}, + {"ip", "link", "set", tapName, "address", macStr}, + // IP stuff and MTU are configured in activateDev() in pkg/child/child.go + } + if err := common.Execs(os.Stderr, os.Environ(), cmds); err != nil { + return fmt.Errorf("executing %v: %w", cmds, err) + } + var err error + tap, err = water.New( + water.Config{ + DeviceType: water.TAP, + PlatformSpecificParams: water.PlatformSpecificParams{ + Name: tapName, + }, + }) + if err != nil { + return fmt.Errorf("creating tap %s: %w", tapName, err) + } + return nil } - tap, err := water.New( - water.Config{ - DeviceType: water.TAP, - PlatformSpecificParams: water.PlatformSpecificParams{ - Name: tapName, - }, - }) - if err != nil { - return "", fmt.Errorf("creating tap %s: %w", tapName, err) + if detachedNetNSPath == "" { + if err := fn(nil); err != nil { + return "", err + } + } else { + if err := ns.WithNetNSPath(detachedNetNSPath, fn); err != nil { + return "", err + } } if tap.Name() != tapName { - return "", fmt.Errorf("expected %q, got %q: %w", tapName, tap.Name(), err) + return "", fmt.Errorf("expected %q, got %q", tapName, tap.Name()) 
} vmnet, err := vmnet.New(ctx, socket) if err != nil { diff --git a/pkg/parent/parent.go b/pkg/parent/parent.go index 29eb7d70..f76da85f 100644 --- a/pkg/parent/parent.go +++ b/pkg/parent/parent.go @@ -15,10 +15,6 @@ import ( "github.com/gofrs/flock" "github.com/gorilla/mux" - - "github.com/sirupsen/logrus" - "golang.org/x/sys/unix" - "github.com/rootless-containers/rootlesskit/pkg/api/router" "github.com/rootless-containers/rootlesskit/pkg/messages" "github.com/rootless-containers/rootlesskit/pkg/network" @@ -28,6 +24,8 @@ import ( "github.com/rootless-containers/rootlesskit/pkg/port" "github.com/rootless-containers/rootlesskit/pkg/sigproxy" "github.com/rootless-containers/rootlesskit/pkg/sigproxy/signal" + "github.com/sirupsen/logrus" + "golang.org/x/sys/unix" ) type Opt struct { @@ -41,6 +39,7 @@ type Opt struct { CreateCgroupNS bool CreateUTSNS bool CreateIPCNS bool + DetachNetNS bool ParentEUIDEnvKey string // optional env key to propagate geteuid() value ParentEGIDEnvKey string // optional env key to propagate getegid() value Propagation string @@ -61,6 +60,7 @@ const ( StateFileLock = "lock" StateFileChildPID = "child_pid" // decimal pid number text StateFileAPISock = "api.sock" // REST API Socket + StateFileNetNs = "netns" // rootlesskit network namespace ) func checkPreflight(opt Opt) error { @@ -155,9 +155,13 @@ func Parent(opt Opt) error { Pdeathsig: syscall.SIGKILL, Cloneflags: syscall.CLONE_NEWUSER | syscall.CLONE_NEWNS, } + if opt.NetworkDriver != nil { - cmd.SysProcAttr.Unshareflags |= syscall.CLONE_NEWNET + if !opt.DetachNetNS { + cmd.SysProcAttr.Unshareflags |= syscall.CLONE_NEWNET + } } + if opt.CreatePIDNS { // cannot be Unshareflags (panics) cmd.SysProcAttr.Cloneflags |= syscall.CLONE_NEWPID @@ -231,8 +235,13 @@ func Parent(opt Opt) error { ParentInitNetworkDriverCompleted: &messages.ParentInitNetworkDriverCompleted{}, }, } + if opt.NetworkDriver != nil { - netMsg, cleanupNetwork, err := opt.NetworkDriver.ConfigureNetwork(cmd.Process.Pid, 
opt.StateDir) + var netns string + if opt.DetachNetNS { + netns = filepath.Join("/proc", strconv.Itoa(cmd.Process.Pid), "root", filepath.Clean(opt.StateDir), "netns") + } + netMsg, cleanupNetwork, err := opt.NetworkDriver.ConfigureNetwork(cmd.Process.Pid, opt.StateDir, netns) if cleanupNetwork != nil { defer cleanupNetwork() } diff --git a/pkg/port/builtin/child/child.go b/pkg/port/builtin/child/child.go index 86cc0281..ca6421db 100644 --- a/pkg/port/builtin/child/child.go +++ b/pkg/port/builtin/child/child.go @@ -11,6 +11,7 @@ import ( "golang.org/x/sys/unix" + "github.com/containernetworking/plugins/pkg/ns" "github.com/rootless-containers/rootlesskit/pkg/lowlevelmsgutil" "github.com/rootless-containers/rootlesskit/pkg/port" "github.com/rootless-containers/rootlesskit/pkg/port/builtin/msg" @@ -27,7 +28,7 @@ type childDriver struct { logWriter io.Writer } -func (d *childDriver) RunChildDriver(opaque map[string]string, quit <-chan struct{}) error { +func (d *childDriver) RunChildDriver(opaque map[string]string, quit <-chan struct{}, detachedNetNSPath string) error { socketPath := opaque[opaquepkg.SocketPath] if socketPath == "" { return errors.New("socket path not set") @@ -68,7 +69,7 @@ func (d *childDriver) RunChildDriver(opaque map[string]string, quit <-chan struc return err } go func() { - if rerr := d.routine(c); rerr != nil { + if rerr := d.routine(c, detachedNetNSPath); rerr != nil { rep := msg.Reply{ Error: rerr.Error(), } @@ -79,7 +80,7 @@ func (d *childDriver) RunChildDriver(opaque map[string]string, quit <-chan struc } } -func (d *childDriver) routine(c *net.UnixConn) error { +func (d *childDriver) routine(c *net.UnixConn, detachedNetNSPath string) error { var req msg.Request if _, err := lowlevelmsgutil.UnmarshalFromReader(c, &req); err != nil { return err @@ -88,7 +89,13 @@ func (d *childDriver) routine(c *net.UnixConn) error { case msg.RequestTypeInit: return d.handleConnectInit(c, &req) case msg.RequestTypeConnect: - return d.handleConnectRequest(c, 
&req) + if detachedNetNSPath == "" { + return d.handleConnectRequest(c, &req) + } else { + return ns.WithNetNSPath(detachedNetNSPath, func(_ ns.NetNS) error { + return d.handleConnectRequest(c, &req) + }) + } default: return fmt.Errorf("unknown request type %q", req.Type) } diff --git a/pkg/port/port.go b/pkg/port/port.go index 367a9788..9581a9ee 100644 --- a/pkg/port/port.go +++ b/pkg/port/port.go @@ -55,5 +55,6 @@ type ParentDriver interface { } type ChildDriver interface { - RunChildDriver(opaque map[string]string, quit <-chan struct{}) error + // RunChildDriver is executed in the child's namespaces, excluding detached-netns. + RunChildDriver(opaque map[string]string, quit <-chan struct{}, detachedNetNSPath string) error } diff --git a/pkg/port/slirp4netns/slirp4netns.go b/pkg/port/slirp4netns/slirp4netns.go index 965f4387..7656a509 100644 --- a/pkg/port/slirp4netns/slirp4netns.go +++ b/pkg/port/slirp4netns/slirp4netns.go @@ -201,7 +201,7 @@ func NewChildDriver() port.ChildDriver { type childDriver struct { } -func (d *childDriver) RunChildDriver(opaque map[string]string, quit <-chan struct{}) error { +func (d *childDriver) RunChildDriver(opaque map[string]string, quit <-chan struct{}, detachedNetNSPath string) error { // NOP <-quit return nil diff --git a/pkg/port/testsuite/testsuite.go b/pkg/port/testsuite/testsuite.go index 18218bf9..94f2f621 100644 --- a/pkg/port/testsuite/testsuite.go +++ b/pkg/port/testsuite/testsuite.go @@ -41,7 +41,7 @@ func Main(m *testing.M, cf func() port.ChildDriver) { errCh := make(chan error) go func() { d := cf() - dErr := d.RunChildDriver(opaque, quit) + dErr := d.RunChildDriver(opaque, quit, "") errCh <- dErr }() quitFD, err := strconv.Atoi(os.Getenv(reexecKeyQuitFD))