Skip to content

Commit

Permalink
fix: workaround a race in CNI setup (talosctl cluster create)
Browse files Browse the repository at this point in the history
When provisioning VMs, each launch process sets up CNI network, and from
time to time CNI setup fails with something like:

```
error provisioning CNI network: plugin type="firewall" failed (add): running [/sbin/iptables -t filter -N CNI-ADMIN --wait]: exit status 4: iptables v1.8.10 (nf_tables)
```

This a race condition in the CNI plugins, and it looks like there is no
fix for it (see e.g. hashicorp/nomad#8838).

As a workaround, take a mutex around CNI operation to serialize them.
CNI setup happens in different processes, so use a file-based mutex.

Signed-off-by: Andrey Smirnov <andrey.smirnov@siderolabs.com>
  • Loading branch information
smira committed Feb 27, 2024
1 parent 4575078 commit b2ad5dc
Show file tree
Hide file tree
Showing 3 changed files with 56 additions and 3 deletions.
1 change: 1 addition & 0 deletions go.mod
Expand Up @@ -34,6 +34,7 @@ require (
github.com/Azure/azure-sdk-for-go/sdk/security/keyvault/azcertificates v1.1.0
github.com/Azure/azure-sdk-for-go/sdk/security/keyvault/azkeys v1.1.0
github.com/BurntSushi/toml v1.3.2
github.com/alexflint/go-filemutex v1.2.0
github.com/aws/aws-sdk-go-v2/config v1.27.0
github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.15.0
github.com/aws/aws-sdk-go-v2/service/kms v1.28.1
Expand Down
2 changes: 2 additions & 0 deletions go.sum
Expand Up @@ -82,6 +82,8 @@ github.com/ProtonMail/gopenpgp/v2 v2.7.5 h1:STOY3vgES59gNgoOt2w0nyHBjKViB/qSg7Nj
github.com/ProtonMail/gopenpgp/v2 v2.7.5/go.mod h1:IhkNEDaxec6NyzSI0PlxapinnwPVIESk8/76da3Ct3g=
github.com/adrg/xdg v0.4.0 h1:RzRqFcjH4nE5C6oTAxhBtoE2IRyjBSa62SCbyPidvls=
github.com/adrg/xdg v0.4.0/go.mod h1:N6ag73EX4wyxeaoeHctc1mas01KZgsj5tYiAIwqJE/E=
github.com/alexflint/go-filemutex v1.2.0 h1:1v0TJPDtlhgpW4nJ+GvxCLSlUDC3+gW0CQQvlmfDR/s=
github.com/alexflint/go-filemutex v1.2.0/go.mod h1:mYyQSWvw9Tx2/H2n9qXPb52tTYfE0pZAWcBq5mK025c=
github.com/apparentlymart/go-cidr v1.1.0 h1:2mAhrMoF+nhXqxTzSZMUzDHkLjmIHC+Zzn4tdgBZjnU=
github.com/apparentlymart/go-cidr v1.1.0/go.mod h1:EBcsNrHc3zQeuaeCeCtQruQm+n9/YjEn/vI25Lg7Gwc=
github.com/armon/circbuf v0.0.0-20190214190532-5111143e8da2 h1:7Ip0wMmLHLRJdrloDxZfhMm0xrLXZS8+COSu2bXmEQs=
Expand Down
56 changes: 53 additions & 3 deletions pkg/provision/providers/qemu/launch.go
Expand Up @@ -16,7 +16,9 @@ import (
"strconv"
"strings"

"github.com/alexflint/go-filemutex"
"github.com/containernetworking/cni/libcni"
"github.com/containernetworking/cni/pkg/types"
types100 "github.com/containernetworking/cni/pkg/types/100"
"github.com/containernetworking/plugins/pkg/ns"
"github.com/containernetworking/plugins/pkg/testutils"
Expand Down Expand Up @@ -94,6 +96,39 @@ type tpm2Config struct {
StateDir string
}

// withCNIOperationLocked ensures that CNI operations don't run concurrently.
//
// There are race conditions in the CNI plugins that can cause a failure if called concurrently.
func withCNIOperationLocked[T any](config *LaunchConfig, f func() (T, error)) (T, error) {
var zeroT T

lock, err := filemutex.New(filepath.Join(config.StatePath, "cni.lock"))
if err != nil {
return zeroT, fmt.Errorf("failed to create CNI lock: %w", err)
}

if err = lock.Lock(); err != nil {
return zeroT, fmt.Errorf("failed to acquire CNI lock: %w", err)
}

defer func() {
if err := lock.Close(); err != nil {
log.Printf("failed to release CNI lock: %s", err)
}
}()

return f()
}

// withCNIOperationLockedNoResult ensures that CNI operations don't run concurrently.
func withCNIOperationLockedNoResult(config *LaunchConfig, f func() error) error {
_, err := withCNIOperationLocked(config, func() (struct{}, error) {
return struct{}{}, f()
})

return err
}

// withCNI creates network namespace, launches CNI and passes control to the next function
// filling config with netNS and interface details.
//
Expand Down Expand Up @@ -134,18 +169,33 @@ func withCNI(ctx context.Context, config *LaunchConfig, f func(config *LaunchCon
}

// attempt to clean up network in case it was deployed previously
err = cniConfig.DelNetworkList(ctx, config.NetworkConfig, &runtimeConf)
err = withCNIOperationLockedNoResult(
config,
func() error {
return cniConfig.DelNetworkList(ctx, config.NetworkConfig, &runtimeConf)
},
)
if err != nil {
return fmt.Errorf("error deleting CNI network: %w", err)
}

res, err := cniConfig.AddNetworkList(ctx, config.NetworkConfig, &runtimeConf)
res, err := withCNIOperationLocked(
config,
func() (types.Result, error) {
return cniConfig.AddNetworkList(ctx, config.NetworkConfig, &runtimeConf)
},
)
if err != nil {
return fmt.Errorf("error provisioning CNI network: %w", err)
}

defer func() {
if e := cniConfig.DelNetworkList(ctx, config.NetworkConfig, &runtimeConf); e != nil {
if e := withCNIOperationLockedNoResult(
config,
func() error {
return cniConfig.DelNetworkList(ctx, config.NetworkConfig, &runtimeConf)
},
); e != nil {
log.Printf("error cleaning up CNI: %s", e)
}
}()
Expand Down

0 comments on commit b2ad5dc

Please sign in to comment.