Skip to content

Commit

Permalink
[Carry 362] support detach-netns
Browse files Browse the repository at this point in the history
Planned to be used for:
- accelerating (and deflaking) `nerdctl pull` and `nerdctl build`
- supporting `nerdctl run --net=host`

This commit is based on PR 362 (originally authored by Fahed Dorgaa),
but almost rewritten from scratch.

Co-authored-by: fahed dorgaa <fahed.dorgaa@gmail.com>
Signed-off-by: Akihiro Suda <akihiro.suda.cz@hco.ntt.co.jp>
  • Loading branch information
AkihiroSuda and fahedouch committed Jun 29, 2023
1 parent 2ba2ca7 commit bb1fd12
Show file tree
Hide file tree
Showing 20 changed files with 247 additions and 84 deletions.
16 changes: 16 additions & 0 deletions .github/workflows/main.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -48,12 +48,20 @@ jobs:
run: |
docker run --rm --security-opt seccomp=unconfined --security-opt apparmor=unconfined --device /dev/net/tun \
rootlesskit:test-integration ./benchmark-iperf3-net.sh slirp4netns 1500 --slirp4netns-sandbox=auto --slirp4netns-seccomp=auto
- name: "Benchmark: Network (MTU=1500, network driver=slirp4netns with sandbox and seccomp) with detach-netns"
run: |
docker run --rm --security-opt seccomp=unconfined --security-opt apparmor=unconfined --device /dev/net/tun \
rootlesskit:test-integration ./benchmark-iperf3-net.sh slirp4netns 1500 --slirp4netns-sandbox=auto --slirp4netns-seccomp=auto --detach-netns
# NOTE: MTU greater than 16424 is known not to work for VPNKit.
# Also, MTU greater than 4K might not be effective for VPNKit: https://twitter.com/mugofsoup/status/1017665057738641408
- name: "Benchmark: Network (MTU=1500, network driver=vpnkit)"
run: |
docker run --rm --security-opt seccomp=unconfined --security-opt apparmor=unconfined --device /dev/net/tun \
rootlesskit:test-integration ./benchmark-iperf3-net.sh vpnkit 1500
- name: "Benchmark: Network (MTU=1500, network driver=vpnkit) with detach-netns"
run: |
docker run --rm --security-opt seccomp=unconfined --security-opt apparmor=unconfined --device /dev/net/tun \
rootlesskit:test-integration ./benchmark-iperf3-net.sh vpnkit 1500 --detach-netns
- name: "Benchmark: Network (MTU=1500, network driver=lxc-user-nic)"
run: |
docker run --rm --privileged \
Expand Down Expand Up @@ -84,10 +92,18 @@ jobs:
run: |
docker run --rm --security-opt seccomp=unconfined --security-opt apparmor=unconfined --device /dev/net/tun \
rootlesskit:test-integration ./benchmark-iperf3-port.sh slirp4netns
- name: "Benchmark: TCP Ports (port driver=slirp4netns) with detach-netns"
run: |
docker run --rm --security-opt seccomp=unconfined --security-opt apparmor=unconfined --device /dev/net/tun \
rootlesskit:test-integration ./benchmark-iperf3-port.sh slirp4netns --detach-netns
- name: "Benchmark: TCP Ports (port driver=builtin)"
run: |
docker run --rm --security-opt seccomp=unconfined --security-opt apparmor=unconfined --device /dev/net/tun \
rootlesskit:test-integration ./benchmark-iperf3-port.sh builtin
- name: "Benchmark: TCP Ports (port driver=builtin) with detach-netns"
run: |
docker run --rm --security-opt seccomp=unconfined --security-opt apparmor=unconfined --device /dev/net/tun \
rootlesskit:test-integration ./benchmark-iperf3-port.sh builtin --detach-netns
# ===== Benchmark: UDP Ports =====
- name: "Benchmark: UDP Ports (port driver=builtin)"
run: |
Expand Down
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -234,6 +234,7 @@ The following files will be created in the state directory, which can be specifi
* `lock`: lock file
* `child_pid`: decimal PID text that can be used for `nsenter(1)`.
* `api.sock`: REST API socket. See [`./docs/api.md`](./docs/api.md) and [`./docs/port.md`](./docs/port.md).
* `netns`: Detached NetNS. Created only with `--detach-netns`. Valid only in the child mount namespace.

If `--state-dir` is not specified, RootlessKit creates a temporary state directory on `/tmp` and removes it on exit.

Expand Down
7 changes: 7 additions & 0 deletions cmd/rootlesskit/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -166,6 +166,10 @@ See https://rootlesscontaine.rs/getting-started/common/ .
Name: "ipcns",
Usage: "create an IPC namespace",
}, CategoryProcess),
Categorize(&cli.BoolFlag{
Name: "detach-netns",
Usage: "detach network namespaces ",
}, CategoryNetwork),
Categorize(&cli.StringFlag{
Name: "propagation",
Usage: "mount propagation [rprivate, rslave]",
Expand Down Expand Up @@ -280,6 +284,7 @@ func createParentOpt(clicontext *cli.Context, pipeFDEnvKey, stateDirEnvKey, pare
CreateCgroupNS: clicontext.Bool("cgroupns"),
CreateUTSNS: clicontext.Bool("utsns"),
CreateIPCNS: clicontext.Bool("ipcns"),
DetachNetNS: clicontext.Bool("detach-netns"),
ParentEUIDEnvKey: parentEUIDEnvKey,
ParentEGIDEnvKey: parentEGIDEnvKey,
Propagation: clicontext.String("propagation"),
Expand Down Expand Up @@ -492,11 +497,13 @@ func (w *logrusDebugWriter) Write(p []byte) (int, error) {

func createChildOpt(clicontext *cli.Context, pipeFDEnvKey, stateDirEnvKey string, targetCmd []string) (child.Opt, error) {
pidns := clicontext.Bool("pidns")
detachNetNS := clicontext.Bool("detach-netns")
opt := child.Opt{
PipeFDEnvKey: pipeFDEnvKey,
StateDirEnvKey: stateDirEnvKey,
TargetCmd: targetCmd,
MountProcfs: pidns,
DetachNetNS: detachNetNS,
Propagation: clicontext.String("propagation"),
EvacuateCgroup2: clicontext.String("evacuate-cgroup2") != "",
}
Expand Down
1 change: 1 addition & 0 deletions docs/internal.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ Common:
- `lock`
- `child_pid`
- `api.sock`
- `netns` (detached netns)

Network driver `slirp4netns`:
- `.s4nn.sock`
Expand Down
6 changes: 6 additions & 0 deletions docs/network.md
Original file line number Diff line number Diff line change
Expand Up @@ -176,3 +176,9 @@ Currently, the MAC address is always set to a random address.

The `--ipv6` flag (since v0.14.0, EXPERIMENTAL) enables IPv6 routing for slirp4netns network driver.
This flag is unrelated to port forwarding.

## Detaching network namespace
The `--detach-netns` flag (since v2.0.0) detaches network namespaces into `$ROOTLESSKIT_STATE_DIR/netns`
and executes the child command in the host's network namespace.

The child command can enter `$ROOTLESSKIT_STATE_DIR/netns` by itself to create nested network namespaces.
1 change: 1 addition & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ module github.com/rootless-containers/rootlesskit
go 1.19

require (
github.com/containernetworking/plugins v1.3.0
github.com/gofrs/flock v0.8.1
github.com/google/uuid v1.3.0
github.com/gorilla/mux v1.8.0
Expand Down
10 changes: 10 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
@@ -1,13 +1,19 @@
github.com/containernetworking/cni v1.1.2 h1:wtRGZVv7olUHMOqouPpn3cXJWpJgM6+EUl31EQbXALQ=
github.com/containernetworking/plugins v1.3.0 h1:QVNXMT6XloyMUoO2wUOqWTC1hWFV62Q6mVDp5H1HnjM=
github.com/containernetworking/plugins v1.3.0/go.mod h1:Pc2wcedTQQCVuROOOaLBPPxrEXqqXBFt3cZ+/yVg6l0=
github.com/cpuguy83/go-md2man/v2 v2.0.2 h1:p1EgwI/C7NhT0JmVkwCD2ZBK8j4aeHQX2pMHHBfMQ6w=
github.com/cpuguy83/go-md2man/v2 v2.0.2/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o=
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/go-logr/logr v1.2.4 h1:g01GSCwiDw2xSZfjJ2/T9M+S6pFdcNtFYsp+Y43HYDQ=
github.com/go-task/slim-sprig v0.0.0-20230315185526-52ccab3ef572 h1:tfuBGBXKqDEevZMzYi5KSi8KkcZtzBcTgAUUtapy0OI=
github.com/gofrs/flock v0.8.1 h1:+gYjHKf32LDeiEEFhQaotPbLuUXjY5ZqxKgXy7n59aw=
github.com/gofrs/flock v0.8.1/go.mod h1:F1TvTiK9OcQqauNUHlbJvyl9Qa1QvF/gOUDKA14jxHU=
github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
github.com/google/go-cmp v0.5.9 h1:O2Tfq5qg4qc4AmwVlvv0oLiVAGB7enBSJ2x2DqQFi38=
github.com/google/go-cmp v0.5.9/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY=
github.com/google/pprof v0.0.0-20230323073829-e72429f035bd h1:r8yyd+DJDmsUhGrRBxH5Pj7KeFK5l+Y3FsgT8keqKtk=
github.com/google/uuid v1.3.0 h1:t6JiXgmwXMjEs8VusXIJk2BXHsn+wx8BZdTaoZ5fu7I=
github.com/google/uuid v1.3.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
github.com/gorilla/mux v1.8.0 h1:i40aqfkR1h2SlN9hojwV5ZA91wcXFOvkdNIeFDP5koI=
Expand All @@ -21,6 +27,8 @@ github.com/moby/sys/mountinfo v0.6.2 h1:BzJjoreD5BMFNmD9Rus6gdd1pLuecOFPt8wC+Vyg
github.com/moby/sys/mountinfo v0.6.2/go.mod h1:IJb6JQeOklcdMU9F5xQ8ZALD+CUr5VlGpwtX+VE0rpI=
github.com/moby/vpnkit v0.5.0 h1:VcDpS9y+PmT9itf+mH5Qdh9GME7ungLMt9yjf9o4REY=
github.com/moby/vpnkit v0.5.0/go.mod h1:KyjUrL9cb6ZSNNAUwZfqRjhwwgJ3BJN+kXh0t43WTUQ=
github.com/onsi/ginkgo/v2 v2.9.2 h1:BA2GMJOtfGAfagzYtrAlufIP0lq6QERkFmHLMLPwFSU=
github.com/onsi/gomega v1.27.6 h1:ENqfyGeS5AX/rlXDd/ETokDz93u0YufY1Pgxuy/PvWE=
github.com/pierrec/lz4/v4 v4.1.14/go.mod h1:gZWDp/Ze/IJXGXf23ltt2EXimqmTUXEy0GFuRQyBid4=
github.com/pierrec/lz4/v4 v4.1.17 h1:kV4Ip+/hUBC+8T6+2EgburRtkE9ef4nbY3f4dFhGjMc=
github.com/pierrec/lz4/v4 v4.1.17/go.mod h1:gZWDp/Ze/IJXGXf23ltt2EXimqmTUXEy0GFuRQyBid4=
Expand Down Expand Up @@ -64,9 +72,11 @@ golang.org/x/sys v0.9.0 h1:KS/R3tvhPqvJvwcKfnBHJwwthS11LRhmM5D59eEXa0s=
golang.org/x/sys v0.9.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/text v0.9.0 h1:2sjJmO8cDvYveuX97RDLsxlyUxLl+GHoLxBiRdHllBE=
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
golang.org/x/tools v0.1.0/go.mod h1:xkSsbof2nBLbhDlRMhhhyNLN/zl3eTqcnHD5viDpcZ0=
golang.org/x/tools v0.7.0 h1:W4OVu8VVOaIO0yzWMNdepAulS7YfoS3Zabrm8DOXXU4=
golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
Expand Down
18 changes: 15 additions & 3 deletions hack/benchmark-iperf3-net.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,26 +2,38 @@
source $(realpath $(dirname $0))/common.inc.sh
function benchmark::iperf3::slirp4netns() {
INFO "[benchmark:iperf3] slirp4netns ($@)"
statedir=$(mktemp -d)
if echo "$@" | grep -q -- --detach-netns; then
IPERF3C="nsenter -n${statedir}/netns $IPERF3C"
fi
set -x
$ROOTLESSKIT --net=slirp4netns $@ -- $IPERF3C 10.0.2.2
$ROOTLESSKIT --state-dir=$statedir --net=slirp4netns $@ -- $IPERF3C 10.0.2.2
set +x
}

function benchmark::iperf3::vpnkit() {
INFO "[benchmark:iperf3] vpnkit ($@)"
statedir=$(mktemp -d)
if echo "$@" | grep -q -- --detach-netns; then
IPERF3C="nsenter -n${statedir}/netns $IPERF3C"
fi
set -x
$ROOTLESSKIT --net=vpnkit $@ -- $IPERF3C 192.168.65.2
$ROOTLESSKIT --state-dir=$statedir --net=vpnkit $@ -- $IPERF3C 192.168.65.2
set +x
}

function benchmark::iperf3::lxc-user-nic() {
INFO "[benchmark:iperf3] lxc-user-nic ($@)"
statedir=$(mktemp -d)
if echo "$@" | grep -q -- --detach-netns; then
IPERF3C="nsenter -n${statedir}/netns $IPERF3C"
fi
dev=lxcbr0
set -x
# ignore "lxc-net is already running" error
sudo /usr/lib/$(uname -m)-linux-gnu/lxc/lxc-net start || true
ip=$(ip -4 -o addr show $dev | awk '{print $4}' | cut -d "/" -f 1)
$ROOTLESSKIT --net=lxc-user-nic $@ -- $IPERF3C $ip
$ROOTLESSKIT --state-dir=$statedir --net=lxc-user-nic $@ -- $IPERF3C $ip
set +x
}

Expand Down
6 changes: 5 additions & 1 deletion hack/benchmark-iperf3-port.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,11 @@ source $(realpath $(dirname $0))/common.inc.sh
function benchmark::iperf3::port() {
statedir=$(mktemp -d)
INFO "[benchmark:iperf3::port] $@"
$ROOTLESSKIT --state-dir=$statedir $@ iperf3 -s >/dev/null &
IPERF3="iperf3"
if echo "$@" | grep -q -- --detach-netns; then
IPERF3="nsenter -n${statedir}/netns $IPERF3"
fi
$ROOTLESSKIT --state-dir=$statedir $@ $IPERF3 -s >/dev/null &
rkpid=$!
# wait for socket to be available
sleep 3
Expand Down
111 changes: 79 additions & 32 deletions pkg/child/child.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,21 +7,22 @@ import (
"os"
"os/exec"
"os/signal"
"path/filepath"
"runtime"
"strconv"
"syscall"
"time"

"github.com/sirupsen/logrus"
"golang.org/x/sys/unix"

"github.com/containernetworking/plugins/pkg/ns"
"github.com/rootless-containers/rootlesskit/pkg/common"
"github.com/rootless-containers/rootlesskit/pkg/copyup"
"github.com/rootless-containers/rootlesskit/pkg/messages"
"github.com/rootless-containers/rootlesskit/pkg/network"
"github.com/rootless-containers/rootlesskit/pkg/port"
"github.com/rootless-containers/rootlesskit/pkg/sigproxy"
sigproxysignal "github.com/rootless-containers/rootlesskit/pkg/sigproxy/signal"
"github.com/sirupsen/logrus"
"golang.org/x/sys/unix"
)

var propagationStates = map[string]uintptr{
Expand Down Expand Up @@ -152,39 +153,60 @@ func setupCopyDir(driver copyup.ChildDriver, dirs []string) (bool, error) {
return false, nil
}

func setupNet(stateDir string, msg *messages.ParentInitNetworkDriverCompleted, etcWasCopied bool, driver network.ChildDriver) error {
func setupNet(stateDir string, msg *messages.ParentInitNetworkDriverCompleted, etcWasCopied bool, driver network.ChildDriver, detachedNetNSPath string) error {
// HostNetwork
if driver == nil {
return nil
}
if err := activateLoopback(); err != nil {
return err
}
dev, err := driver.ConfigureNetworkChild(msg)
if err != nil {
return err
}
if err := activateDev(dev, msg.IP, msg.Netmask, msg.Gateway, msg.MTU); err != nil {
return err
}
if etcWasCopied {
if err := writeResolvConf(msg.DNS); err != nil {

if detachedNetNSPath == "" {
// non-detached mode
if err := activateLoopback(); err != nil {
return err
}
dev, err := driver.ConfigureNetworkChild(msg, detachedNetNSPath)
if err != nil {
return err
}
if err := writeEtcHosts(); err != nil {
if err := activateDev(dev, msg.IP, msg.Netmask, msg.Gateway, msg.MTU); err != nil {
return err
}
if etcWasCopied {
if err := writeResolvConf(msg.DNS); err != nil {
return err
}
if err := writeEtcHosts(); err != nil {
return err
}
} else {
logrus.Warn("Mounting /etc/resolv.conf without copying-up /etc. " +
"Note that /etc/resolv.conf in the namespace will be unmounted when it is recreated on the host. " +
"Unless /etc/resolv.conf is statically configured, copying-up /etc is highly recommended. " +
"Please refer to RootlessKit documentation for further information.")
if err := mountResolvConf(stateDir, msg.DNS); err != nil {
return err
}
if err := mountEtcHosts(stateDir); err != nil {
return err
}
}
} else {
logrus.Warn("Mounting /etc/resolv.conf without copying-up /etc. " +
"Note that /etc/resolv.conf in the namespace will be unmounted when it is recreated on the host. " +
"Unless /etc/resolv.conf is statically configured, copying-up /etc is highly recommended. " +
"Please refer to RootlessKit documentation for further information.")
if err := mountResolvConf(stateDir, msg.DNS); err != nil {
// detached mode
if err := ns.WithNetNSPath(detachedNetNSPath, func(_ ns.NetNS) error {
return activateLoopback()
}); err != nil {
return err
}
dev, err := driver.ConfigureNetworkChild(msg, detachedNetNSPath)
if err != nil {
return err
}
if err := mountEtcHosts(stateDir); err != nil {
if err := ns.WithNetNSPath(detachedNetNSPath, func(_ ns.NetNS) error {
return activateDev(dev, msg.IP, msg.Netmask, msg.Gateway, msg.MTU)
}); err != nil {
return err
}
// TODO: write /etc/resolv.conf and /etc/hosts in a custom directory?
}
return nil
}
Expand All @@ -196,6 +218,7 @@ type Opt struct {
NetworkDriver network.ChildDriver // nil for HostNetwork
CopyUpDriver copyup.ChildDriver // cannot be nil if len(CopyUpDirs) != 0
CopyUpDirs []string
DetachNetNS bool
PortDriver port.ChildDriver
MountProcfs bool // needs to be set if (and only if) parent.Opt.CreatePIDNS is set
Propagation string // mount propagation type
Expand Down Expand Up @@ -322,6 +345,20 @@ func Child(opt Opt) error {
}
}

if opt.MountProcfs {
if err := mountProcfs(); err != nil {
return err
}
}

var detachedNetNSPath string
if opt.DetachNetNS {
detachedNetNSPath = filepath.Join(stateDir, "netns")
if err = NewNetNsWithPathWithoutEnter(detachedNetNSPath); err != nil {
return fmt.Errorf("failed to create a detached netns on %q: %w", detachedNetNSPath, err)
}
}

msgChildInitUserNSCompleted := &messages.Message{
U: messages.U{
ChildInitUserNSCompleted: &messages.ChildInitUserNSCompleted{},
Expand Down Expand Up @@ -362,17 +399,14 @@ func Child(opt Opt) error {
if err != nil {
return err
}
if err := mountSysfs(opt.NetworkDriver == nil, opt.EvacuateCgroup2); err != nil {
return err
}
if err := setupNet(stateDir, netMsg, etcWasCopied, opt.NetworkDriver); err != nil {
return err
}
if opt.MountProcfs {
if err := mountProcfs(); err != nil {
if detachedNetNSPath == "" {
if err := mountSysfs(opt.NetworkDriver == nil, opt.EvacuateCgroup2); err != nil {
return err
}
}
if err := setupNet(stateDir, netMsg, etcWasCopied, opt.NetworkDriver, detachedNetNSPath); err != nil {
return err
}
portQuitCh := make(chan struct{})
portErrCh := make(chan error)
if opt.PortDriver != nil {
Expand All @@ -381,7 +415,7 @@ func Child(opt Opt) error {
portDriverOpaque = portMsg.PortDriverOpaque
}
go func() {
portErrCh <- opt.PortDriver.RunChildDriver(portDriverOpaque, portQuitCh)
portErrCh <- opt.PortDriver.RunChildDriver(portDriverOpaque, portQuitCh, detachedNetNSPath)
}()
}

Expand Down Expand Up @@ -484,3 +518,16 @@ func (e *reaperErr) Error() string {
}
return fmt.Sprintf("exited with WAITSTATUS=0x%08x", e.ws)
}

// NewNetNsWithPathWithoutEnter creates a network namespace that is bind-mounted
// on the file p, without moving the calling process into it.
//
// An empty placeholder file (mode 0400) is first written at p to act as the
// mount point. The namespace itself is created by running
// "unshare -n mount --bind /proc/self/ns/net p" as a child process, so the
// unshare(1) and mount(8) binaries must be resolvable via PATH.
//
// It returns the error from writing the placeholder file as-is, or an error
// wrapping the command failure together with its combined stdout/stderr.
func NewNetNsWithPathWithoutEnter(p string) error {
	// A bind mount needs an existing target, so create the empty file first.
	if writeErr := os.WriteFile(p, nil, 0400); writeErr != nil {
		return writeErr
	}
	// this is hard (not impossible though) to reimplement in Go: https://github.com/cloudflare/slirpnetstack/commit/d7766a8a77f0093d3cb7a94bd0ccbe3f67d411ba
	unshareArgs := []string{"-n", "mount", "--bind", "/proc/self/ns/net", p}
	unshareCmd := exec.Command("unshare", unshareArgs...)
	if output, runErr := unshareCmd.CombinedOutput(); runErr != nil {
		return fmt.Errorf("failed to execute %v: %w (out=%q)", unshareCmd.Args, runErr, string(output))
	}
	return nil
}

0 comments on commit bb1fd12

Please sign in to comment.