From 4eba2768a582727c76eeecc0b5780c30713822b7 Mon Sep 17 00:00:00 2001 From: Neil Jerram Date: Tue, 21 Aug 2018 14:46:47 +0100 Subject: [PATCH 1/5] Clarify ifacemonitor_test logging --- ifacemonitor/ifacemonitor_test.go | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/ifacemonitor/ifacemonitor_test.go b/ifacemonitor/ifacemonitor_test.go index d99220d309..0b1d4a7858 100644 --- a/ifacemonitor/ifacemonitor_test.go +++ b/ifacemonitor/ifacemonitor_test.go @@ -69,6 +69,7 @@ type mockDataplane struct { } func (nl *netlinkTest) addLink(name string) { + log.WithFields(log.Fields{"name": name}).Info("ADDLINK") nl.linksMutex.Lock() if nl.links == nil { nl.links = map[string]linkModel{} @@ -85,6 +86,7 @@ func (nl *netlinkTest) addLink(name string) { } func (nl *netlinkTest) renameLink(oldName, newName string) { + log.WithFields(log.Fields{"oldName": oldName, "newName": newName}).Info("RENAMELINK") nl.linksMutex.Lock() link := nl.links[oldName] delete(nl.links, oldName) @@ -94,6 +96,7 @@ func (nl *netlinkTest) renameLink(oldName, newName string) { } func (nl *netlinkTest) changeLinkState(name string, state string) { + log.WithFields(log.Fields{"name": name, "state": state}).Info("CHANGELINKSTATE") nl.linksMutex.Lock() link := nl.links[name] link.state = state @@ -103,6 +106,7 @@ func (nl *netlinkTest) changeLinkState(name string, state string) { } func (nl *netlinkTest) delLink(name string) { + log.WithFields(log.Fields{"name": name}).Info("DELLINK") var oldIndex int nl.linksMutex.Lock() link, prs := nl.links[name] @@ -155,6 +159,7 @@ func (nl *netlinkTest) signalLink(name string, oldIndex int) { } func (nl *netlinkTest) addAddr(name string, addr string) { + log.WithFields(log.Fields{"name": name, "addr": addr}).Info("ADDADDR") nl.linksMutex.Lock() link := nl.links[name] link.addrs.Add(addr) @@ -164,6 +169,7 @@ func (nl *netlinkTest) addAddr(name string, addr string) { } func (nl *netlinkTest) delAddr(name string, addr string) { + log.WithFields(log.Fields{"name": name, "addr": addr}).Info("DELADDR") nl.linksMutex.Lock() link := nl.links[name] link.addrs.Discard(addr) @@ -255,8 +261,7 @@ func (nl *netlinkTest) AddrList(link netlink.Link, family int) ([]netlink.Addr, } func (dp *mockDataplane) linkStateCallback(ifaceName string, ifaceState ifacemonitor.State) { - log.Info("linkStateCallback: ifaceName=", ifaceName) - log.Info("linkStateCallback: ifaceState=", ifaceState) + log.WithFields(log.Fields{"name": ifaceName, "state": ifaceState}).Info("CALLBACK LINK") dp.linkC <- linkUpdate{ name: ifaceName, state: ifaceState, @@ -281,7 +286,7 @@ func (dp *mockDataplane) addrStateCallback(ifaceName string, addrs set.Set) { log.WithFields(log.Fields{ "ifaceName": ifaceName, "addrs": addrs, - }).Info("Address state updated") + }).Info("CALLBACK ADDR") dp.addrC <- addrState{ifaceName: ifaceName, addrs: addrs} log.Info("mock dataplane reported address callback") } From 62741b49e9335559f0b5392bf0f14d53bd303325 Mon Sep 17 00:00:00 2001 From: Neil Jerram Date: Wed, 22 Aug 2018 10:27:24 +0100 Subject: [PATCH 2/5] UT when resync spots that a link has been deleted A customer reported a calico/node having got into a state where expected iptables programming for a HostEndpoint was missing. They created a bridge device, and a HostEndpoint to protect that, and initially the expected iptables programming was present. However, after some period of churn, which we believe included - deleting and re-adding the bridge device - setting the state of the bridge device down and later up again - creating and deleting many real or simulated pods/containers on that node, it was observed that the HostEndpoint iptables were missing, and that setting the bridge device down and up made no difference to this. The missing iptables could only be recreated by restarting the calico/node. We identified a bug that would account for this if all of the following points are also true: 1. One time when the bridge device is deleted, the Netlink event for that is lost (or delayed for a long time), and the removal of the device is then detected by a period resync that the Felix code does. The bug is that the exact handling here is different than when processing an individual Netlink event. 2. When the bridge device is added again, it has the same addresses and interface index as before it was deleted, and either there is no Netlink event reporting the addresses, or Felix sees that event before it sees the event about the device reappearing. 3. Later, when the bridge device is set down and up again, either there are no address changes that ensue from that (which is unusual, because Linux devices normally do IPv6 autoconfiguration, which means that an IPv6 address is added when the device goes up, and taken away when the device goes down), or there is some other reason why Netlink address events are not generated, or don't reach Felix. This change adds a UT case to repro that scenario, and demonstrates the result that we are missing a non-nil address callback when the bridge device is added again. In Felix as a whole, the absence of that callback means that HostEndpoint iptables are not reprogrammed. --- ifacemonitor/ifacemonitor_test.go | 52 ++++++++++++++++++++++++++++--- 1 file changed, 47 insertions(+), 5 deletions(-) diff --git a/ifacemonitor/ifacemonitor_test.go b/ifacemonitor/ifacemonitor_test.go index 0b1d4a7858..6a8fec1da7 100644 --- a/ifacemonitor/ifacemonitor_test.go +++ b/ifacemonitor/ifacemonitor_test.go @@ -69,6 +69,11 @@ type mockDataplane struct { } func (nl *netlinkTest) addLink(name string) { + nl.addLinkNoSignal(name) + nl.signalLink(name, 0) +} + +func (nl *netlinkTest) addLinkNoSignal(name string) { log.WithFields(log.Fields{"name": name}).Info("ADDLINK") nl.linksMutex.Lock() if nl.links == nil { @@ -82,7 +87,6 @@ func (nl *netlinkTest) addLink(name string) { } nl.nextIndex++ nl.linksMutex.Unlock() - nl.signalLink(name, 0) } func (nl *netlinkTest) renameLink(oldName, newName string) { @@ -106,8 +110,12 @@ func (nl *netlinkTest) changeLinkState(name string, state string) { } func (nl *netlinkTest) delLink(name string) { + oldIndex := nl.delLinkNoSignal(name) + nl.signalLink(name, oldIndex) +} + +func (nl *netlinkTest) delLinkNoSignal(name string) (oldIndex int) { log.WithFields(log.Fields{"name": name}).Info("DELLINK") - var oldIndex int nl.linksMutex.Lock() link, prs := nl.links[name] if prs { @@ -117,7 +125,7 @@ func (nl *netlinkTest) delLink(name string) { } delete(nl.links, name) nl.linksMutex.Unlock() - nl.signalLink(name, oldIndex) + return } func (nl *netlinkTest) signalLink(name string, oldIndex int) { @@ -172,8 +180,10 @@ func (nl *netlinkTest) delAddr(name string, addr string) { log.WithFields(log.Fields{"name": name, "addr": addr}).Info("DELADDR") nl.linksMutex.Lock() link := nl.links[name] - link.addrs.Discard(addr) - nl.links[name] = link + if link.addrs != nil { + link.addrs.Discard(addr) + nl.links[name] = link + } nl.linksMutex.Unlock() nl.signalAddr(name, addr, false) } @@ -487,4 +497,36 @@ var _ = Describe("ifacemonitor", func() { resyncC <- time.Time{} resyncC <- time.Time{} }) + + It("should handle link flap", func() { + // Add a link and an address. + nl.addLink("eth0") + resyncC <- time.Time{} + dp.expectAddrStateCb("eth0", "", true) + nl.addAddr("eth0", "10.0.240.10/24") + dp.expectAddrStateCb("eth0", "10.0.240.10", true) + + // Set the link up, and expect a link callback. Addresses are unchanged, so there + // is no address callback. + nl.changeLinkState("eth0", "up") + dp.expectLinkStateCb("eth0", ifacemonitor.StateUp) + + // Delete the link, and have that picked up by resync. For this scenario we have to + // assume that there is never any Netlink signal for the link deletion. + _ = nl.delLinkNoSignal("eth0") + resyncC <- time.Time{} + dp.expectLinkStateCb("eth0", ifacemonitor.StateDown) + dp.expectAddrStateCb("eth0", "", false) + + // Add the link again, with the same ifIndex, but hold the signal through Netlink. + nl.nextIndex-- + nl.addLinkNoSignal("eth0") + // Add the address, and let that go through Netlink. + nl.addAddr("eth0", "10.0.240.10/24") + // Now signal the link. + nl.signalLink("eth0", 0) + + // Now we should see an address callback again. + dp.expectAddrStateCb("eth0", "10.0.240.10", true) + }) }) From a538fc9484cf24a1562727b50ca649e131b75ea9 Mon Sep 17 00:00:00 2001 From: Neil Jerram Date: Wed, 22 Aug 2018 23:25:12 +0100 Subject: [PATCH 3/5] Clear caches when resync spots a link as having disappeared --- ifacemonitor/iface_monitor.go | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/ifacemonitor/iface_monitor.go b/ifacemonitor/iface_monitor.go index 9afe5974a8..1dc73061ac 100644 --- a/ifacemonitor/iface_monitor.go +++ b/ifacemonitor/iface_monitor.go @@ -52,7 +52,7 @@ type InterfaceMonitor struct { netlinkStub netlinkStub resyncC <-chan time.Time - upIfaces set.Set + upIfaces map[string]int Callback InterfaceStateCallback AddrCallback AddrStateCallback ifaceName map[int]string @@ -70,7 +70,7 @@ func NewWithStubs(config Config, netlinkStub netlinkStub, resyncC <-chan time.Ti Config: config, netlinkStub: netlinkStub, resyncC: resyncC, - upIfaces: set.New(), + upIfaces: map[string]int{}, ifaceName: map[int]string{}, ifaceAddrs: map[int]set.Set{}, } @@ -268,15 +268,15 @@ func (m *InterfaceMonitor) storeAndNotifyLinkInner(ifaceExists bool, ifaceName s // etc. rawFlags := attrs.RawFlags ifaceIsUp := ifaceExists && rawFlags&syscall.IFF_RUNNING != 0 - ifaceWasUp := m.upIfaces.Contains(ifaceName) + _, ifaceWasUp := m.upIfaces[ifaceName] logCxt := log.WithField("ifaceName", ifaceName) if ifaceIsUp && !ifaceWasUp { logCxt.Debug("Interface now up") - m.upIfaces.Add(ifaceName) + m.upIfaces[ifaceName] = ifIndex m.Callback(ifaceName, StateUp) } else if ifaceWasUp && !ifaceIsUp { logCxt.Debug("Interface now down") - m.upIfaces.Discard(ifaceName) + delete(m.upIfaces, ifaceName) m.Callback(ifaceName, StateDown) } else { logCxt.WithField("ifaceIsUp", ifaceIsUp).Debug("Nothing to notify") @@ -326,15 +326,17 @@ func (m *InterfaceMonitor) resync() error { currentIfaces.Add(attrs.Name) m.storeAndNotifyLink(true, link) } - m.upIfaces.Iter(func(name interface{}) error { + for name, ifIndex := range m.upIfaces { if currentIfaces.Contains(name) { - return nil + continue } log.WithField("ifaceName", name).Info("Spotted interface removal on resync.") - m.Callback(name.(string), StateDown) - m.AddrCallback(name.(string), nil) - return set.RemoveItem - }) + m.Callback(name, StateDown) + m.AddrCallback(name, nil) + delete(m.upIfaces, name) + delete(m.ifaceAddrs, ifIndex) + delete(m.ifaceName, ifIndex) + } log.Debug("Resync complete") return nil } From 0c1ce452e8358cd16271dba18023267d7c6d0dca Mon Sep 17 00:00:00 2001 From: Neil Jerram Date: Thu, 23 Aug 2018 09:33:30 +0100 Subject: [PATCH 4/5] Code review markups --- ifacemonitor/iface_monitor.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ifacemonitor/iface_monitor.go b/ifacemonitor/iface_monitor.go index 1dc73061ac..621233cb11 100644 --- a/ifacemonitor/iface_monitor.go +++ b/ifacemonitor/iface_monitor.go @@ -52,7 +52,7 @@ type InterfaceMonitor struct { netlinkStub netlinkStub resyncC <-chan time.Time - upIfaces map[string]int + upIfaces map[string]int // Map from interface name to index. Callback InterfaceStateCallback AddrCallback AddrStateCallback ifaceName map[int]string From 0c7a912dd3911404afdcc6f3deac1d86e1833971 Mon Sep 17 00:00:00 2001 From: Neil Jerram Date: Thu, 23 Aug 2018 11:40:57 +0100 Subject: [PATCH 5/5] Fix ifacemonitor UT flake The test code's intention is to test when a deleted link is detected in a resync. However there was a race, and the following was also possible: 1. Kick off a resync. 2. Kick off link deletion. 3. Resync queries links and finds that the link is question is still present. 4. Link deletion now proceeds, and state for that link deleted. 5. Resync queries addresses for that link and finds that there aren't any. resulting in an addr callback with set.New(), instead of an addr callback with nil. We can now use delLinkNoSignal to ensure that the original intended scenario reliably happens, so that's what this change does. --- ifacemonitor/ifacemonitor_test.go | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/ifacemonitor/ifacemonitor_test.go b/ifacemonitor/ifacemonitor_test.go index 6a8fec1da7..ad742adf57 100644 --- a/ifacemonitor/ifacemonitor_test.go +++ b/ifacemonitor/ifacemonitor_test.go @@ -447,12 +447,11 @@ var _ = Describe("ifacemonitor", func() { nl.changeLinkState("eth0", "up") dp.expectLinkStateCb("eth0", ifacemonitor.StateUp) - // Trigger a resync, then immediately delete the link. What happens is that the - // test code deletes its state for eth0 before the monitor's resync() calls - // LinkList, and so the monitor reports "Spotted interface removal on resync" and - // makes link and address callbacks accordingly. + // Test when a deleted link is detected in a resync. The monitor should report + // "Spotted interface removal on resync" and make link and address callbacks + // accordingly. + nl.delLinkNoSignal("eth0") resyncC <- time.Time{} - nl.delLink("eth0") dp.expectLinkStateCb("eth0", ifacemonitor.StateDown) dp.expectAddrStateCb("eth0", "", false)