Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 30 additions & 6 deletions calico/_includes/charts/calico/templates/calico-node.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -178,6 +178,25 @@ spec:
{{- end }}
securityContext:
privileged: true
# This init container mounts the necessary filesystems needed by the BPF data plane
# i.e. bpf at /sys/fs/bpf and cgroup2 at /run/calico/cgroup. Calico-node initialisation is executed
# in best-effort fashion, i.e. errors are reported but do not cause a failure, so that pod creation is not disrupted in iptables mode.
- name: "mount-bpffs"
image: {{.Values.node.image}}:{{.Values.node.tag}}
command: ["calico-node", "-init", "-best-effort"]
volumeMounts:
- mountPath: /sys/fs
name: sys-fs
# Bidirectional is required to ensure that the new mount we make at /sys/fs/bpf propagates to the host
# so that it outlives the init container.
mountPropagation: Bidirectional
# Mount the host's /proc/1 (usually the init process) at /initproc. It is needed by the mountns binary,
# executed by calico-node, to mount the root cgroup2 fs at /run/calico/cgroup so that CTLB programs attach correctly.
- mountPath: /initproc
name: init-proc
readOnly: true
securityContext:
privileged: true
containers:
# Runs {{ include "nodeName" . }} container on each Kubernetes node. This
# container programs network policy and routes on each
Expand Down Expand Up @@ -415,11 +434,8 @@ spec:
mountPath: /var/run/nodeagent
# For eBPF mode, we need to be able to mount the BPF filesystem at /sys/fs/bpf so we mount in the
# parent directory.
- name: sysfs
mountPath: /sys/fs/
# Bidirectional means that, if we mount the BPF filesystem at /sys/fs/bpf it will propagate to the host.
# If the host is known to mount that filesystem already then Bidirectional can be omitted.
mountPropagation: Bidirectional
- name: bpffs
mountPath: /sys/fs/bpf
- name: cni-log-dir
mountPath: /var/log/calico/cni
readOnly: true
Expand Down Expand Up @@ -552,10 +568,18 @@ spec:
hostPath:
path: /run/xtables.lock
type: FileOrCreate
- name: sysfs
- name: sys-fs
hostPath:
path: /sys/fs/
type: DirectoryOrCreate
- name: bpffs
hostPath:
path: /sys/fs/bpf
type: Directory
# mount /proc/1 at /initproc to be used by mount-bpffs initContainer to mount root cgroup2 fs.
- name: init-proc
hostPath:
path: /proc/1
{{- if and (eq .Values.network "flannel") (eq .Values.datastore "kubernetes") }}
# Used by flannel.
- name: flannel-cfg
Expand Down
37 changes: 16 additions & 21 deletions felix/bpf/bpf.go
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,8 @@ const (
sockmapEndpointsMapVersion = "v1"
sockmapEndpointsMapName = "calico_sk_endpoints_" + sockmapEndpointsMapVersion

defaultBPFfsPath = "/sys/fs/bpf"
DefaultBPFfsPath = "/sys/fs/bpf"
CgroupV2Path = "/run/calico/cgroup"
)

var (
Expand Down Expand Up @@ -195,20 +196,20 @@ func NewBPFLib(binDir string) (*BPFLib, error) {

func MaybeMountBPFfs() (string, error) {
var err error
bpffsPath := defaultBPFfsPath
bpffsPath := DefaultBPFfsPath

mnt, err := isMount(defaultBPFfsPath)
mnt, err := isMount(DefaultBPFfsPath)
if err != nil {
return "", err
}

fsBPF, err := isBPF(defaultBPFfsPath)
fsBPF, err := isBPF(DefaultBPFfsPath)
if err != nil {
return "", err
}

if !mnt {
err = mountBPFfs(defaultBPFfsPath)
err = mountBPFfs(DefaultBPFfsPath)
} else if !fsBPF {
var runfsBPF bool

Expand All @@ -233,29 +234,27 @@ func MaybeMountBPFfs() (string, error) {

func MaybeMountCgroupV2() (string, error) {
var err error
cgroupV2Path := "/run/calico/cgroup"

if err := os.MkdirAll(cgroupV2Path, 0700); err != nil {
if err := os.MkdirAll(CgroupV2Path, 0700); err != nil {
return "", err
}

mnt, err := isMount(cgroupV2Path)
mnt, err := isMount(CgroupV2Path)
if err != nil {
return "", fmt.Errorf("error checking if %s is a mount: %v", cgroupV2Path, err)
return "", fmt.Errorf("error checking if %s is a mount: %v", CgroupV2Path, err)
}

fsCgroup, err := isCgroupV2(cgroupV2Path)
fsCgroup, err := isCgroupV2(CgroupV2Path)
if err != nil {
return "", fmt.Errorf("error checking if %s is CgroupV2: %v", cgroupV2Path, err)
return "", fmt.Errorf("error checking if %s is CgroupV2: %v", CgroupV2Path, err)
}

if !mnt {
err = mountCgroupV2(cgroupV2Path)
err = mountCgroupV2(CgroupV2Path)
} else if !fsCgroup {
err = fmt.Errorf("something that's not cgroup v2 is already mounted in %s", cgroupV2Path)
err = fmt.Errorf("something that's not cgroup v2 is already mounted in %s", CgroupV2Path)
}

return cgroupV2Path, err
return CgroupV2Path, err
}

func mountCgroupV2(path string) error {
Expand Down Expand Up @@ -290,25 +289,21 @@ func isMount(path string) (bool, error) {
}

// isBPF reports whether the filesystem that path resides on has the BPF
// filesystem type, as reported by statfs(2). If statfs fails, it returns false
// together with an error stating that path is not mounted.
//
// NOTE(review): this view is a diff rendering, so the local magic-number
// constant and the two consecutive return statements appear to be the removed
// and added sides of the same change — confirm against the real file.
func isBPF(path string) (bool, error) {
bpffsMagicNumber := uint32(0xCAFE4A11)

var fsdata unix.Statfs_t
if err := unix.Statfs(path, &fsdata); err != nil {
return false, fmt.Errorf("%s is not mounted", path)
}

return uint32(fsdata.Type) == bpffsMagicNumber, nil
return uint32(fsdata.Type) == uint32(unix.BPF_FS_MAGIC), nil
}

// isCgroupV2 reports whether the filesystem that path resides on has the
// cgroup v2 filesystem type, as reported by statfs(2). If statfs fails, it
// returns false together with an error stating that path is not mounted.
//
// NOTE(review): this view is a diff rendering, so the local magic-number
// constant and the two consecutive return statements appear to be the removed
// and added sides of the same change — confirm against the real file.
func isCgroupV2(path string) (bool, error) {
cgroup2MagicNumber := uint32(0x63677270)

var fsdata unix.Statfs_t
if err := unix.Statfs(path, &fsdata); err != nil {
return false, fmt.Errorf("%s is not mounted", path)
}

return uint32(fsdata.Type) == cgroup2MagicNumber, nil
return uint32(fsdata.Type) == uint32(unix.CGROUP2_SUPER_MAGIC), nil
}

func mountBPFfs(path string) error {
Expand Down
8 changes: 7 additions & 1 deletion node/Dockerfile.amd64
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2015-2021 Tigera, Inc. All rights reserved.
# Copyright (c) 2015-2022 Tigera, Inc. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -205,6 +205,12 @@ COPY dist/bin/calico-node-amd64 /bin/calico-node
# Set the suid bit on calico-node
RUN chmod u+s /bin/calico-node

# Copy in the mountns binary
COPY dist/bin/mountns-amd64 /bin/mountns

# Set the suid bit on mountns
RUN chmod u+s /bin/mountns

# Clean out as many files as we can from the filesystem. We no longer need dnf or the platform python install
# or any of its dependencies.
ADD clean-up-filesystem.sh /
Expand Down
8 changes: 7 additions & 1 deletion node/Dockerfile.arm64
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2015-2021 Tigera, Inc. All rights reserved.
# Copyright (c) 2015-2022 Tigera, Inc. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -192,6 +192,12 @@ COPY --from=bpftool /bpftool /bin
# Copy in the calico-node binary
COPY dist/bin/calico-node-arm64 /bin/calico-node

# Copy in the mountns binary
COPY dist/bin/mountns-arm64 /bin/mountns

# Set the suid bit on mountns
RUN chmod u+s /bin/mountns

# Clean out as many files as we can from the filesystem. We no longer need dnf or the platform python install
# or any of its dependencies.
ADD clean-up-filesystem.sh /
Expand Down
6 changes: 6 additions & 0 deletions node/Dockerfile.armv7
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,12 @@ COPY filesystem/ /
# Copy in the calico-node binary
COPY dist/bin/calico-node-${ARCH} /bin/calico-node

# Copy in the mountns binary
COPY dist/bin/mountns-${ARCH} /bin/mountns

# Set the suid bit on mountns
RUN chmod u+s /bin/mountns

RUN rm /usr/bin/qemu-arm-static

CMD ["start_runit"]
Expand Down
6 changes: 6 additions & 0 deletions node/Dockerfile.ppc64le
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,12 @@ COPY filesystem/ /
# Copy in the calico-node binary
COPY dist/bin/calico-node-${ARCH} /bin/calico-node

# Copy in the mountns binary
COPY dist/bin/mountns-${ARCH} /bin/mountns

# Set the suid bit on mountns
RUN chmod u+s /bin/mountns

COPY --from=bpftool /bpftool /bin

RUN rm /usr/bin/qemu-${ARCH}-static
Expand Down
6 changes: 6 additions & 0 deletions node/Dockerfile.s390x
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,12 @@ COPY filesystem/ /
# Copy in the calico-node binary
COPY dist/bin/calico-node-${ARCH} /bin/calico-node

# Copy in the mountns binary
COPY dist/bin/mountns-${ARCH} /bin/mountns

# Set the suid bit on mountns
RUN chmod u+s /bin/mountns

COPY --from=bpftool /bpftool /bin

RUN rm /usr/bin/qemu-${ARCH}-static
Expand Down
8 changes: 6 additions & 2 deletions node/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@ NODE_CONTAINER_CREATED=.calico_node.created-$(ARCH)
NODE_CONTAINER_BIN_DIR=./dist/bin/
NODE_CONTAINER_BINARY = $(NODE_CONTAINER_BIN_DIR)/calico-node-$(ARCH)
WINDOWS_BINARY = $(NODE_CONTAINER_BIN_DIR)/calico-node.exe
TOOLS_MOUNTNS_BINARY = $(NODE_CONTAINER_BIN_DIR)/mountns-$(ARCH)

WINDOWS_INSTALL_SCRIPT := dist/install-calico-windows.ps1

Expand Down Expand Up @@ -195,7 +196,7 @@ clean-windows:
###############################################################################
# Building the binary
###############################################################################
build: $(NODE_CONTAINER_BINARY)
build: $(NODE_CONTAINER_BINARY) $(TOOLS_MOUNTNS_BINARY)

# Pull in config from confd.
filesystem/etc/calico/confd/conf.d: $(shell find ../confd/etc/calico/confd/conf.d -type f)
Expand Down Expand Up @@ -251,6 +252,9 @@ $(WINDOWS_ARCHIVE_ROOT)/cni/calico-ipam.exe:
$(CALICO_BUILD) sh -c '$(GIT_CONFIG_SSH) \
go build -v -o $@ $(LDFLAGS) ./cmd/calico-ipam'

$(TOOLS_MOUNTNS_BINARY):
$(DOCKER_GO_BUILD_CGO) sh -c '$(GIT_CONFIG_SSH) go build -v -o $@ $(BUILD_FLAGS) $(LDFLAGS) ./cmd/mountns'

###############################################################################
# Building the image
###############################################################################
Expand All @@ -260,7 +264,7 @@ sub-image-%:
$(MAKE) image ARCH=$*

image $(NODE_IMAGE): register $(NODE_CONTAINER_CREATED)
$(NODE_CONTAINER_CREATED): $(REMOTE_DEPS) ./Dockerfile.$(ARCH) $(NODE_CONTAINER_BINARY) $(INCLUDED_SOURCE) $(NODE_CONTAINER_FILES)
$(NODE_CONTAINER_CREATED): $(REMOTE_DEPS) ./Dockerfile.$(ARCH) $(NODE_CONTAINER_BINARY) $(INCLUDED_SOURCE) $(NODE_CONTAINER_FILES) $(TOOLS_MOUNTNS_BINARY)
$(DOCKER_BUILD) --build-arg BIRD_IMAGE=$(BIRD_IMAGE) -t $(NODE_IMAGE):latest-$(ARCH) -f ./Dockerfile.$(ARCH) . --load
$(MAKE) retag-build-images-with-registries VALIDARCHES=$(ARCH) IMAGETAG=latest
touch $@
Expand Down
5 changes: 4 additions & 1 deletion node/clean-up-filesystem.sh
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#!/bin/bash
# Copyright (c) 2020 Tigera, Inc. All rights reserved.
# Copyright (c) 2020-2022 Tigera, Inc. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -116,6 +116,9 @@ bin_allow_list_patterns=(
zless
zmore

# Needed for eBPF mode to mount the cgroupv2 filesystem on the host.
mountns

# Used by this script.
'/find$'
'/ldd$'
Expand Down
8 changes: 6 additions & 2 deletions node/cmd/calico-node/main.go
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// Copyright (c) 2018,2021 Tigera, Inc. All rights reserved.
// Copyright (c) 2018-2022 Tigera, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -48,6 +48,7 @@ var version = flagSet.Bool("v", false, "Display version")
var runFelix = flagSet.Bool("felix", false, "Run Felix")
var runBPF = flagSet.Bool("bpf", false, "Run BPF debug tool")
var runInit = flagSet.Bool("init", false, "Do privileged initialisation of a new node (mount file systems etc).")
var bestEffort = flagSet.Bool("best-effort", false, "Used in combination with the init flag. Report errors but do not fail if an error occures during initialisation.")
var runStartup = flagSet.Bool("startup", false, "Do non-privileged start-up routine.")
var runWinUpgrade = flagSet.Bool("upgrade-windows", false, "Run Windows upgrade service.")
var runShouldInstallWindowsUpgrade = flagSet.Bool("should-install-windows-upgrade", false, "Check if Windows upgrade service should be installed.")
Expand Down Expand Up @@ -132,7 +133,10 @@ func main() {
bpf.RunBPFCmd()
} else if *runInit {
logrus.SetFormatter(&logutils.Formatter{Component: "init"})
nodeinit.Run()
if *bestEffort {
logrus.SetFormatter(&logutils.Formatter{Component: "init-best-effort"})
}
nodeinit.Run(*bestEffort)
} else if *runStartup {
logrus.SetFormatter(&logutils.Formatter{Component: "startup"})
startup.Run()
Expand Down
73 changes: 73 additions & 0 deletions node/cmd/mountns/main.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
//go:build cgo

// Copyright (c) 2022 Tigera, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package main

import (
"fmt"
"os"
"syscall"
)

// As more systems adopt cgroup2, k8s started to containerize each pod in a separate cgroup.
// This change prevents felix from attaching CTLB programs to cgroup ns correctly. To fix the issue,
// we need to mount root cgroup at /run/calico/cgroup (where felix expects it), not the one
// allocated by k8s to calico-node. This binary takes the following steps to solve it:
// - Enter the namespace root before mounting cgroup2 fs. (Usually, /proc/1/ns points to
// the root of all namespaces, however, we mount /proc/1 on host at /initproc on calico-node pod,
// so /initproc/ns is the root of namespaces.)
// - Mount root cgroups fs at /run/calico/cgroup.

// The following C code is executed as a cgo constructor which runs before the main function.
// The reason for this behavior is to set cgroup and mount namespace correctly, before mounting
// cgroup2 fs in the main function. Mount ns can only be changed in a single-thread process,
// so we need to change it by exploiting cgo constructor before Go runtime starts new threads.

// In addition, normal frameworks, like logrus, are not used in the go code to prevent:
// - any unexpected initialisation logic. This is important for setting mount ns
// correctly, as mentioned above.
// - unnecessary increase of the binary size, which currently is less than 2MB.

/*
#define _GNU_SOURCE
#include <sched.h>
#include <fcntl.h>

__attribute__((constructor)) void set_namespaces(void) {
// open /initproc/ns/cgroup, which is equivalent to /proc/1/ns/cgroup on host.
// Then run setns syscall to change the cgroup namespace to this value.
setns(open("/initproc/ns/cgroup", O_RDONLY, 0), CLONE_NEWCGROUP);

// open /initproc/ns/mnt, which is equivalent to /proc/1/ns/mnt on host.
// Then run setns syscall to change the mount namespace to this value.
setns(open("/initproc/ns/mnt", O_RDONLY, 0), CLONE_NEWNS);
} */
import "C"

// main mounts a cgroup2 filesystem at the mount point named by the first
// command-line argument. It prints a usage message and exits with status 1
// when no mount point is given, and exits with status 1 when the mount
// syscall fails. Progress and error messages are written to standard output.
func main() {
	args := os.Args
	if len(args) < 2 {
		fmt.Printf("Usage: %s <mountpoint>\n", args[0])
		os.Exit(1)
	}
	target := args[1]

	fmt.Println("Trying to mount root cgroup fs.")
	if mountErr := syscall.Mount("none", target, "cgroup2", 0, ""); mountErr != nil {
		fmt.Printf("Failed to mount Cgroup filesystem. err: %v\n", mountErr)
		os.Exit(1)
	}
	fmt.Println("Successfully mounted root cgroup fs.")
}
Loading