From b2ad5dc5f809da9665b41c25d9ab6359a87ec942 Mon Sep 17 00:00:00 2001 From: Andrey Smirnov Date: Tue, 27 Feb 2024 15:28:12 +0400 Subject: [PATCH] fix: workaround a race in CNI setup (talosctl cluster create) When provisioning VMs, each launch process sets up CNI network, and from time to time CNI setup fails with something like: ``` error provisioning CNI network: plugin type="firewall" failed (add): running [/sbin/iptables -t filter -N CNI-ADMIN --wait]: exit status 4: iptables v1.8.10 (nf_tables) ``` This a race condition in the CNI plugins, and it looks like there is no fix for it (see e.g. https://github.com/hashicorp/nomad/issues/8838). As a workaround, take a mutex around CNI operation to serialize them. CNI setup happens in different processes, so use a file-based mutex. Signed-off-by: Andrey Smirnov --- go.mod | 1 + go.sum | 2 + pkg/provision/providers/qemu/launch.go | 56 ++++++++++++++++++++++++-- 3 files changed, 56 insertions(+), 3 deletions(-) diff --git a/go.mod b/go.mod index 61b184bcfe..8ac3306690 100644 --- a/go.mod +++ b/go.mod @@ -34,6 +34,7 @@ require ( github.com/Azure/azure-sdk-for-go/sdk/security/keyvault/azcertificates v1.1.0 github.com/Azure/azure-sdk-for-go/sdk/security/keyvault/azkeys v1.1.0 github.com/BurntSushi/toml v1.3.2 + github.com/alexflint/go-filemutex v1.2.0 github.com/aws/aws-sdk-go-v2/config v1.27.0 github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.15.0 github.com/aws/aws-sdk-go-v2/service/kms v1.28.1 diff --git a/go.sum b/go.sum index 0a3f0a96b0..a2fa2b6ded 100644 --- a/go.sum +++ b/go.sum @@ -82,6 +82,8 @@ github.com/ProtonMail/gopenpgp/v2 v2.7.5 h1:STOY3vgES59gNgoOt2w0nyHBjKViB/qSg7Nj github.com/ProtonMail/gopenpgp/v2 v2.7.5/go.mod h1:IhkNEDaxec6NyzSI0PlxapinnwPVIESk8/76da3Ct3g= github.com/adrg/xdg v0.4.0 h1:RzRqFcjH4nE5C6oTAxhBtoE2IRyjBSa62SCbyPidvls= github.com/adrg/xdg v0.4.0/go.mod h1:N6ag73EX4wyxeaoeHctc1mas01KZgsj5tYiAIwqJE/E= +github.com/alexflint/go-filemutex v1.2.0 h1:1v0TJPDtlhgpW4nJ+GvxCLSlUDC3+gW0CQQvlmfDR/s= +github.com/alexflint/go-filemutex v1.2.0/go.mod h1:mYyQSWvw9Tx2/H2n9qXPb52tTYfE0pZAWcBq5mK025c= github.com/apparentlymart/go-cidr v1.1.0 h1:2mAhrMoF+nhXqxTzSZMUzDHkLjmIHC+Zzn4tdgBZjnU= github.com/apparentlymart/go-cidr v1.1.0/go.mod h1:EBcsNrHc3zQeuaeCeCtQruQm+n9/YjEn/vI25Lg7Gwc= github.com/armon/circbuf v0.0.0-20190214190532-5111143e8da2 h1:7Ip0wMmLHLRJdrloDxZfhMm0xrLXZS8+COSu2bXmEQs= diff --git a/pkg/provision/providers/qemu/launch.go b/pkg/provision/providers/qemu/launch.go index b8908a4177..53cdbf7889 100644 --- a/pkg/provision/providers/qemu/launch.go +++ b/pkg/provision/providers/qemu/launch.go @@ -16,7 +16,9 @@ import ( "strconv" "strings" + "github.com/alexflint/go-filemutex" "github.com/containernetworking/cni/libcni" + "github.com/containernetworking/cni/pkg/types" types100 "github.com/containernetworking/cni/pkg/types/100" "github.com/containernetworking/plugins/pkg/ns" "github.com/containernetworking/plugins/pkg/testutils" @@ -94,6 +96,39 @@ type tpm2Config struct { StateDir string } +// withCNIOperationLocked ensures that CNI operations don't run concurrently. +// +// There are race conditions in the CNI plugins that can cause a failure if called concurrently. +func withCNIOperationLocked[T any](config *LaunchConfig, f func() (T, error)) (T, error) { + var zeroT T + + lock, err := filemutex.New(filepath.Join(config.StatePath, "cni.lock")) + if err != nil { + return zeroT, fmt.Errorf("failed to create CNI lock: %w", err) + } + + if err = lock.Lock(); err != nil { + return zeroT, fmt.Errorf("failed to acquire CNI lock: %w", err) + } + + defer func() { + if err := lock.Close(); err != nil { + log.Printf("failed to release CNI lock: %s", err) + } + }() + + return f() +} + +// withCNIOperationLockedNoResult ensures that CNI operations don't run concurrently. +func withCNIOperationLockedNoResult(config *LaunchConfig, f func() error) error { + _, err := withCNIOperationLocked(config, func() (struct{}, error) { + return struct{}{}, f() + }) + + return err +} + // withCNI creates network namespace, launches CNI and passes control to the next function // filling config with netNS and interface details. // @@ -134,18 +169,33 @@ func withCNI(ctx context.Context, config *LaunchConfig, f func(config *LaunchCon } // attempt to clean up network in case it was deployed previously - err = cniConfig.DelNetworkList(ctx, config.NetworkConfig, &runtimeConf) + err = withCNIOperationLockedNoResult( + config, + func() error { + return cniConfig.DelNetworkList(ctx, config.NetworkConfig, &runtimeConf) + }, + ) if err != nil { return fmt.Errorf("error deleting CNI network: %w", err) } - res, err := cniConfig.AddNetworkList(ctx, config.NetworkConfig, &runtimeConf) + res, err := withCNIOperationLocked( + config, + func() (types.Result, error) { + return cniConfig.AddNetworkList(ctx, config.NetworkConfig, &runtimeConf) + }, + ) if err != nil { return fmt.Errorf("error provisioning CNI network: %w", err) } defer func() { - if e := cniConfig.DelNetworkList(ctx, config.NetworkConfig, &runtimeConf); e != nil { + if e := withCNIOperationLockedNoResult( + config, + func() error { + return cniConfig.DelNetworkList(ctx, config.NetworkConfig, &runtimeConf) + }, + ); e != nil { log.Printf("error cleaning up CNI: %s", e) } }()