Flight Controller that Automatically Fixes Common Problems #246

Merged 1 commit on Mar 26, 2018
1 change: 1 addition & 0 deletions deps/deps.go
@@ -6,6 +6,7 @@ package extra_dependencies
import (
_ "github.com/stretchr/testify/assert"
_ "github.com/stretchr/testify/require"
_ "github.com/stretchr/testify/mock"
_ "k8s.io/client-go/kubernetes/fake"
_ "k8s.io/code-generator/cmd/client-gen"
_ "k8s.io/code-generator/cmd/informer-gen"
212 changes: 211 additions & 1 deletion docs/development/controllers.md
@@ -41,6 +41,216 @@ The bug occasionally leaves static route entries in the cluster's OpenStack routers

The bug is fixed in the upcoming 1.10 version of kubernetes. That means this controller is only needed for clusters <1.10.

For each cluster the controller polls the corresponding OpenStack router and
inspects the configured static routes. It tries to identify and remove orphaned
routes to fix the cluster's networking. It removes all route entries for CIDR
ranges that are within the cluster's Pod CIDR range where the target/nextHop IP
address can't be matched to an OpenStack compute instance.
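
For illustration, a minimal sketch of that check using gophercloud's layer-3
router API. The function and the `serverIPs` lookup are hypothetical and not
part of this change; the real controller additionally handles the actual route
removal.

```go
package flight

import (
	"net"

	"github.com/gophercloud/gophercloud"
	"github.com/gophercloud/gophercloud/openstack/networking/v2/extensions/layer3/routers"
)

// orphanedRoutes returns the static routes whose destination lies inside the
// cluster's Pod CIDR but whose next hop is not the IP of any known compute
// instance. serverIPs maps instance IP addresses to true.
func orphanedRoutes(network *gophercloud.ServiceClient, routerID string, podCIDR *net.IPNet, serverIPs map[string]bool) ([]routers.Route, error) {
	router, err := routers.Get(network, routerID).Extract()
	if err != nil {
		return nil, err
	}

	var orphaned []routers.Route
	for _, route := range router.Routes {
		_, dest, err := net.ParseCIDR(route.DestinationCIDR)
		if err != nil || !podCIDR.Contains(dest.IP) {
			continue // not a Pod route of this cluster
		}
		if !serverIPs[route.NextHop] {
			orphaned = append(orphaned, route) // next hop has no matching instance
		}
	}
	return orphaned, nil
}
```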


Flight Controller
-----------------

This controller takes care of Kluster health. It looks for obvious problems
and tries to repair them.

### Security Group (DVS) Update Latency

Updates to the security group are not instant. There is a non-trivial amount of
latency involved. This leads to timeouts and edge cases, as the definition of
tolerable latency differs. Most notably, this affects DHCP and Ignition. Default
tolerances are in the range of 30s.

The latency of updates is related to the general load on the regions, "noisy"
neighbors blocking the update loops, the number of ports in the same project,
and Neutron and VCenter performance.

If the updates take longer than these timeouts the following symptoms appear:

* Nodes Running but never become healthy
* No IPv4 Address visible on the VNC Console
* No SSH Login Possible
* Kubelet not running

These symptoms indicate that the node couldn't configure its network
interface before the Ignition timeout. This effectively leaves the node broken.

Possible Workarounds:

* Increase DHCP/Ignition Timeout
This configuration needs to be baked into the image as an OEM customization.
It also interacts with the DHCP client timeout which again requires a
modification of the image. With frequent CoreOS updates this modification
needs to be automatic and included in the image build pipeline.

* Reboot the Instance
This is the preferred workaround (see the sketch after this list). It gives the
DVS agents additional time to configure the existing instance and retries the
Ignition run (to be verified).

* Delete the Instance
This workaround is problematic. It will not succeed if the update latency is
too high in general.
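
As a sketch of the reboot workaround, assuming gophercloud's compute API; the
wrapper function is hypothetical, and the exact reboot options may differ
slightly between gophercloud versions:

```go
package flight

import (
	"github.com/gophercloud/gophercloud"
	"github.com/gophercloud/gophercloud/openstack/compute/v2/servers"
)

// rebootStuckNode soft-reboots an instance that never became a healthy node,
// giving the DVS agents another chance to configure its port in time.
func rebootStuckNode(compute *gophercloud.ServiceClient, serverID string) error {
	return servers.Reboot(compute, serverID, servers.RebootOpts{Type: servers.SoftReboot}).ExtractErr()
}
```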


### Security Group Update Event Missed

If an instance successfully downloads and starts the Kubelet, it registers itself
with the APIServer and gets a PodCIDR range assigned. This triggers a
reconfiguration of the Neutron Router and adds a static route. The route points
the PodCIDR to the node's IP address. This is required to satisfy the Kubernetes
pod to pod communication requirements.

As the PodCIDR subnet is now actually routed via the Neutron router, it also
needs to be allowed in the security group.

This happens by updating the node's Neutron port and adding the required CIDR to
`allowed_address_pairs`. This triggers an event that the port was updated. The DVS
agents catch this update and add an additional rule to the security group.
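
For illustration, a minimal gophercloud sketch of that port update. The function
name and wiring are hypothetical (in practice the cloud provider integration
performs this update), and field types (plain slice vs. pointer) vary slightly
between gophercloud versions:

```go
package flight

import (
	"github.com/gophercloud/gophercloud"
	"github.com/gophercloud/gophercloud/openstack/networking/v2/ports"
)

// allowPodCIDR appends the node's PodCIDR to the allowed_address_pairs of its
// Neutron port, preserving any existing pairs. The resulting port update event
// is what the DVS agents react to.
func allowPodCIDR(network *gophercloud.ServiceClient, portID, podCIDR string) error {
	port, err := ports.Get(network, portID).Extract()
	if err != nil {
		return err
	}

	pairs := append(port.AllowedAddressPairs, ports.AddressPair{IPAddress: podCIDR})
	_, err = ports.Update(network, portID, ports.UpdateOpts{AllowedAddressPairs: &pairs}).Extract()
	return err
}
```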

Occasionally, this update is missed. Until a full reconciliation loop happens
(usually triggered by a restart or update of the DVS agents) the following symptoms appear:

* Sporadic Pod Communication
* Sporadic Service Communication
* Sporadic Load Balancer Communication
* Sporadic DNS Problems in Pods
Depending on the disconnected node, pods can't reach the Kube-DNS service. DNS
will still work on the nodes themselves.
* Load Balancer Pools Flapping

Possible Workarounds:

* Add PodCIDRs to Security Group
Instead of relying on the unreliable Oslo events, all possible PodCIDRs are
added to the kluster's security group up front. By default this is 198.19.0.0/16
(see the sketch after this list).

* Trigger Security Group Sync
Instead of waiting for a security group reconciliation, force an update by
periodically adding a (random) change to the security group. If possible this
should only be triggered when a node condition indicates pod communication
problems.
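
The first workaround is what `EnsureKubernikusRuleInSecurityGroup` further down
in this change implements; a condensed sketch of the rule creation (the wrapper
function is hypothetical, the gophercloud calls mirror the real code, which also
checks whether an equivalent rule already exists):

```go
package flight

import (
	"github.com/gophercloud/gophercloud"
	"github.com/gophercloud/gophercloud/openstack/networking/v2/extensions/security/rules"
)

// ensureIngressRule creates an ingress rule allowing the whole Pod CIDR
// (198.19.0.0/16 by default) for one protocol in the given security group.
func ensureIngressRule(network *gophercloud.ServiceClient, secGroupID, podCIDR string, proto rules.RuleProtocol) error {
	_, err := rules.Create(network, rules.CreateOpts{
		Direction:      rules.DirIngress,
		EtherType:      rules.EtherType4,
		SecGroupID:     secGroupID,
		RemoteIPPrefix: podCIDR,
		Protocol:       proto,
	}).Extract()
	return err
}
```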


### Default Security Group Missing

When a new node is created via Nova, a security group can be specified. The user
can select this security group during Kluster creation. If nothing is selected,
the `default` security group is assumed.

For as yet unknown reasons, there's a chance that Nova configures the instance
without this security group association. In effect, the instance is completely
disconnected from the network.

Symptoms are similar to (1):

* Nodes Running but never become healthy
* No IPv4 Address visible on the VNC Console
* No SSH Login Possible
* Kubelet not running

Possible Workaround:

* Reconcile Security Group Associations
Periodically check that instances which haven't registered as nodes do have
the required security group attached. If not, set it again (see the sketch below).
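
A minimal sketch of re-attaching the security group; `SetSecurityGroup` further
down in this change performs the same gophercloud call (the wrapper name here is
hypothetical):

```go
package flight

import (
	"github.com/gophercloud/gophercloud"
	"github.com/gophercloud/gophercloud/openstack/compute/v2/extensions/secgroups"
)

// reattachSecurityGroup re-adds the Kluster's security group to an instance
// that came up without it.
func reattachSecurityGroup(compute *gophercloud.ServiceClient, serverID, securityGroupName string) error {
	return secgroups.AddServer(compute, serverID, securityGroupName).ExtractErr()
}
```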


### ASR Route Duplicates

When a node is being deleted, its route is removed in Neutron. The ASR agents
get notified by an event and remove the route from the ASR device.

First of all, this requires that the state of the Neutron DB reflects reality.
Updates to the routes are done by the RouteController in the Kubernetes OpenStack
cloud provider. Before 1.10 there's a bug that causes these updates to be missed.
In Kubernikus we fixed this by adding an additional RouteNanny for now.

When a Kluster is terminated forcefully, the RouteController might be destroyed
before it manages to update the Neutron database. The reconciliation happens
every 60 seconds. We counter this by gracefully deorbiting the Kluster, waiting
for the updates by either the RouteController or the RouteNanny.

Unfortunately, during normal operations, such as scaling node pools or terminating
nodes, updates to the routes get missed as well. In that case the state in Neutron
is correct, while the state on the ASR device still shows the deleted route. This
should be fixable by triggering or manually running a re-sync. Unfortunately,
that does not work.

The re-sync mechanism between Neutron and ASR is not perfect. There is currently
the problem that routes which have been removed in Neutron will not be removed
during the sync; it only considers routes that have been added.

The only way to recover this situation is to manually `delete` and then `sync` the
router using the `asr1k_utils`. Additional node conditions that check for this
problem will facilitate alerting and manual intervention.

These duplicate routes are fatal because the IPAM module in the CNI plugin
recycles PodCIDRs immediately. A new node will receive the PodCIDR of a previously
existing node. The old node's routes are still configured on the ASR device and
take precedence. That leaves the new node broken. For example, the state of the
ASR routes after multiple node deletions:

    198.19.1.0/24 -> 10.180.0.3
    198.19.1.0/24 -> 10.180.0.4
    198.19.1.0/24 -> 10.180.0.5

The correct route is the last one, pointing to the latest instance. The route in
effect, however, is the first one, pointing to 10.180.0.3, which doesn't exist anymore.

Symptoms:

* See (2). Sporadic Pod/Service/LB Communication Problems
* Only the first cluster in each project works

Workarounds:

* None Known

* There's no known way to trigger a router `asr1k_utils delete` +
`asr1k_utils sync` via OpenStack without actually deleting the whole Neutron
router construct. If a side-channel could somehow be leveraged it would be
possible to recover automatically.


### ASR Missing Routes

Due to various effects it is possible that the ASR agents miss the event to add
an additional route when a new node is created.

One especially fatal effect is the failover between the `ACTIVE` and `STANDBY`
router. It seems to be a rather common defect (potentially even intended) that
only the `ACTIVE` router receives sync events. Upon failover, routes are missing
or reflect the state of a previous cluster.

Symptoms:

* See (2)

Workarounds:

* Manual Sync

* Trigger Sync automatically
There's no direct interface to trigger a sync via the OpenStack API. It can be
forced indirectly by an action that triggers a sync: attaching/detaching a
FIP/interface, or adding/removing a route (see the sketch below).
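
A sketch of the route-based nudge, assuming gophercloud's layer-3 router API.
The function name and parameters are hypothetical, the field types (plain slice
vs. pointer) differ between gophercloud versions, and `nextHop` must be an
address on a subnet attached to the router for Neutron to accept the route:

```go
package flight

import (
	"github.com/gophercloud/gophercloud"
	"github.com/gophercloud/gophercloud/openstack/networking/v2/extensions/layer3/routers"
)

// forceASRSync nudges the ASR agents by adding a throw-away route to the
// Neutron router and removing it again; both updates emit events the agents
// react to. dummyCIDR must not collide with any real route.
func forceASRSync(network *gophercloud.ServiceClient, routerID, dummyCIDR, nextHop string) error {
	router, err := routers.Get(network, routerID).Extract()
	if err != nil {
		return err
	}

	withDummy := append(router.Routes, routers.Route{DestinationCIDR: dummyCIDR, NextHop: nextHop})
	if _, err := routers.Update(network, routerID, routers.UpdateOpts{Routes: &withDummy}).Extract(); err != nil {
		return err
	}

	// Restore the original routes to remove the dummy again.
	_, err = routers.Update(network, routerID, routers.UpdateOpts{Routes: &router.Routes}).Extract()
	return err
}
```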


### Neutron Static Route Limit

There's a static limit of 31 routes in Neutron.

In projects with frequent Kluster creates and deletes, the route limit can be
exceeded due to duplicate routes as described in (4).

Symptoms:

* See (2). Pod/Service/LB communication problems
* 409 Conflict Errors in RouteController and RouteNanny
* Klusters have a max size of 31 Nodes

Workarounds:

* None. Neutron needs to be reconfigured.
* Clean up duplicate routes (see the detection sketch below)
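
For the clean-up, a minimal detection sketch over the Neutron router's routes,
assuming gophercloud's layer-3 API (the function name is hypothetical):

```go
package flight

import (
	"github.com/gophercloud/gophercloud"
	"github.com/gophercloud/gophercloud/openstack/networking/v2/extensions/layer3/routers"
)

// duplicateRoutes returns all routes on a Neutron router whose destination
// CIDR appears more than once, i.e. candidates for manual clean-up.
func duplicateRoutes(network *gophercloud.ServiceClient, routerID string) ([]routers.Route, error) {
	router, err := routers.Get(network, routerID).Extract()
	if err != nil {
		return nil, err
	}

	seen := map[string]int{}
	for _, route := range router.Routes {
		seen[route.DestinationCIDR]++
	}

	var dupes []routers.Route
	for _, route := range router.Routes {
		if seen[route.DestinationCIDR] > 1 {
			dupes = append(dupes, route)
		}
	}
	return dupes, nil
}
```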

9 changes: 6 additions & 3 deletions glide.lock

Some generated files are not rendered by default.

3 changes: 2 additions & 1 deletion glide.yaml
@@ -4,6 +4,7 @@ import:
version: ^1.1.0
subpackages:
- assert
- mock
- package: k8s.io/client-go
version: v6.0.0
- package: k8s.io/apimachinery
@@ -27,7 +28,7 @@ import:
- log
# overwrite gophercloud version given in k8s deps
- package: github.com/gophercloud/gophercloud
version: fe864ba585e153c296567b9ab4bbb1345d05900a
version: c7ca48da8eafab13e1835090a1147e64cfc10174
# Dependencies extracted from go-swagger 0.13.0
- package: github.com/go-openapi/errors
version: 03cfca65330da08a5a440053faf994a3c682b5bf
94 changes: 94 additions & 0 deletions pkg/client/openstack/kluster/client.go
@@ -5,7 +5,10 @@ import (
"time"

"github.com/gophercloud/gophercloud"
"github.com/gophercloud/gophercloud/openstack/compute/v2/extensions/secgroups"
"github.com/gophercloud/gophercloud/openstack/compute/v2/servers"
securitygroups "github.com/gophercloud/gophercloud/openstack/networking/v2/extensions/security/groups"
"github.com/gophercloud/gophercloud/openstack/networking/v2/extensions/security/rules"
"github.com/gophercloud/gophercloud/pagination"
"k8s.io/client-go/tools/cache"

@@ -18,6 +21,8 @@ type KlusterClient interface {
CreateNode(*models.NodePool, []byte) (string, error)
DeleteNode(string) error
ListNodes(*models.NodePool) ([]Node, error)
SetSecurityGroup(nodeID string) error
EnsureKubernikusRuleInSecurityGroup() (bool, error)
}

type klusterClient struct {
@@ -117,6 +122,95 @@ func (c *klusterClient) ListNodes(pool *models.NodePool) (nodes []Node, err error) {
return nodes, nil
}

// SetSecurityGroup adds the Kluster's configured security group to the given
// Nova instance (used to repair nodes that came up without it).
func (c *klusterClient) SetSecurityGroup(nodeID string) (err error) {
return secgroups.AddServer(c.ComputeClient, nodeID, c.Kluster.Spec.Openstack.SecurityGroupName).ExtractErr()
}

// EnsureKubernikusRuleInSecurityGroup checks the Kluster's security group for
// ingress ICMP, UDP and TCP rules covering the ClusterCIDR and creates any that
// are missing. It returns true if at least one rule had to be created.
func (c *klusterClient) EnsureKubernikusRuleInSecurityGroup() (created bool, err error) {
page, err := securitygroups.List(c.NetworkClient, securitygroups.ListOpts{Name: c.Kluster.Spec.Openstack.SecurityGroupName}).AllPages()
if err != nil {
return false, fmt.Errorf("SecurityGroup %v not found: %s", c.Kluster.Spec.Openstack.SecurityGroupName, err)
}

groups, err := securitygroups.ExtractGroups(page)
if err != nil {
return false, err
}

if len(groups) != 1 {
return false, fmt.Errorf("Expected exactly one SecurityGroup with name %v, got %d", c.Kluster.Spec.Openstack.SecurityGroupName, len(groups))
}

udp := false
tcp := false
icmp := false
for _, rule := range groups[0].Rules {
if rule.Direction != string(rules.DirIngress) {
continue
}

if rule.EtherType != string(rules.EtherType4) {
continue
}

if rule.RemoteIPPrefix != c.Kluster.Spec.ClusterCIDR {
continue
}

if rule.Protocol == string(rules.ProtocolICMP) {
icmp = true
continue
}

if rule.Protocol == string(rules.ProtocolUDP) {
udp = true
continue
}

if rule.Protocol == string(rules.ProtocolTCP) {
tcp = true
continue
}

if icmp && udp && tcp {
break
}
}

opts := rules.CreateOpts{
Direction: rules.DirIngress,
EtherType: rules.EtherType4,
SecGroupID: groups[0].ID,
RemoteIPPrefix: c.Kluster.Spec.ClusterCIDR,
}

if !udp {
opts.Protocol = rules.ProtocolUDP
_, err := rules.Create(c.NetworkClient, opts).Extract()
if err != nil {
return false, err
}
}

if !tcp {
opts.Protocol = rules.ProtocolTCP
_, err := rules.Create(c.NetworkClient, opts).Extract()
if err != nil {
return false, err
}
}

if !icmp {
opts.Protocol = rules.ProtocolICMP
_, err := rules.Create(c.NetworkClient, opts).Extract()
if err != nil {
return false, err
}
}

return !udp || !tcp || !icmp, nil
}

func ExtractServers(r pagination.Page) ([]Node, error) {
var s []Node
err := servers.ExtractServersInto(r, &s)