Skip to content

Commit

Permalink
Fix non-encap routes on parent device change (#8279)
Browse files Browse the repository at this point in the history
Previously, there were two problems:

- The routing table was not updated to use the new parent.
- After restarting Felix, the routing table was unable to clean up the old routes.

This PR fixes the first (and has a feature flag that helps with the second):

- VXLAN manager now recreates the RouteTable when the parent device changes.
- To make that safe, move the creation to the main goroutine, with a channel to send the new parent name.
- When making a new RouteTable, include previous parent interface names in the regex so that the old routes get cleaned up. This only works if Felix is not restarted at the wrong moment.
- As belt-and-braces, add a disabled-by-default feature flag that tells the RouteTable to use RouteReplace to clobber conflicting routes. (This was my first attempt at a fix but it has its own problems.)

Also noticed that KeepVXLANDeviceInSync could take a long time to respond to changes, so add a kick channel.

Move feature gates to the feature detector, to avoid needing to pass yet another object to 30 places.
  • Loading branch information
fasaxc committed Jan 26, 2024
1 parent 4fea4fa commit d3513a8
Show file tree
Hide file tree
Showing 13 changed files with 461 additions and 115 deletions.
29 changes: 22 additions & 7 deletions felix/dataplane/linux/int_dataplane.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
package intdataplane

import (
"context"
"fmt"
"net"
"os"
Expand Down Expand Up @@ -286,6 +287,11 @@ type InternalDataplane struct {

ipipManager *ipipManager

vxlanManager *vxlanManager
vxlanParentC chan string
vxlanManagerV6 *vxlanManager
vxlanParentCV6 chan string

wireguardManager *wireguardManager
wireguardManagerV6 *wireguardManager

Expand Down Expand Up @@ -380,7 +386,10 @@ func NewIntDataplaneDriver(config Config) *InternalDataplane {
log.WithError(err).Error("Failed to write MTU file, pod MTU may not be properly set")
}

featureDetector := environment.NewFeatureDetector(config.FeatureDetectOverrides)
featureDetector := environment.NewFeatureDetector(
config.FeatureDetectOverrides,
environment.WithFeatureGates(config.FeatureGates),
)
dp := &InternalDataplane{
toDataplane: make(chan interface{}, msgPeekLimit),
fromDataplane: make(chan interface{}, 100),
Expand Down Expand Up @@ -505,7 +514,7 @@ func NewIntDataplaneDriver(config Config) *InternalDataplane {
routeTableVXLAN = &routetable.DummyTable{}
}

vxlanManager := newVXLANManager(
dp.vxlanManager = newVXLANManager(
ipSetsV4,
routeTableVXLAN,
"vxlan.calico",
Expand All @@ -514,8 +523,9 @@ func NewIntDataplaneDriver(config Config) *InternalDataplane {
4,
featureDetector,
)
go vxlanManager.KeepVXLANDeviceInSync(config.VXLANMTU, dataplaneFeatures.ChecksumOffloadBroken, 10*time.Second)
dp.RegisterManager(vxlanManager)
dp.vxlanParentC = make(chan string, 1)
go dp.vxlanManager.KeepVXLANDeviceInSync(context.Background(), config.VXLANMTU, dataplaneFeatures.ChecksumOffloadBroken, 10*time.Second, dp.vxlanParentC)
dp.RegisterManager(dp.vxlanManager)
} else {
// Start a cleanup goroutine not to block felix if it needs to retry
go cleanUpVXLANDevice("vxlan.calico")
Expand Down Expand Up @@ -962,7 +972,7 @@ func NewIntDataplaneDriver(config Config) *InternalDataplane {
routeTableVXLANV6 = &routetable.DummyTable{}
}

vxlanManagerV6 := newVXLANManager(
dp.vxlanManagerV6 = newVXLANManager(
ipSetsV6,
routeTableVXLANV6,
"vxlan-v6.calico",
Expand All @@ -971,8 +981,9 @@ func NewIntDataplaneDriver(config Config) *InternalDataplane {
6,
featureDetector,
)
go vxlanManagerV6.KeepVXLANDeviceInSync(config.VXLANMTUV6, dataplaneFeatures.ChecksumOffloadBroken, 10*time.Second)
dp.RegisterManager(vxlanManagerV6)
dp.vxlanParentCV6 = make(chan string, 1)
go dp.vxlanManagerV6.KeepVXLANDeviceInSync(context.Background(), config.VXLANMTUV6, dataplaneFeatures.ChecksumOffloadBroken, 10*time.Second, dp.vxlanParentCV6)
dp.RegisterManager(dp.vxlanManagerV6)
} else {
// Start a cleanup goroutine not to block felix if it needs to retry
go cleanUpVXLANDevice("vxlan-v6.calico")
Expand Down Expand Up @@ -1848,6 +1859,10 @@ func (d *InternalDataplane) loopUpdatingDataplane() {
d.onDatastoreMessage(msg)
case ifaceUpdate := <-d.ifaceUpdates:
d.onIfaceMonitorMessage(ifaceUpdate)
case name := <-d.vxlanParentC:
d.vxlanManager.OnParentNameUpdate(name)
case name := <-d.vxlanParentCV6:
d.vxlanManagerV6.OnParentNameUpdate(name)
case <-ipSetsRefreshC:
log.Debug("Refreshing IP sets state")
d.forceIPSetsRefresh = true
Expand Down
Loading

0 comments on commit d3513a8

Please sign in to comment.