From e0ec5fbb0b80bc33baaf21bba99b450875fab7d2 Mon Sep 17 00:00:00 2001 From: Sridhar Gaddam Date: Wed, 14 Jul 2021 11:41:17 +0530 Subject: [PATCH] Fix subctl diagnose hostname mismatch issue Submariner Endpoint stores the hostname info as part of the endpoint object. In most of the K8s clusters, the hostname matches with the nodeName, but on some clusters, it was seen that nodeName does not match. This PR fixes this issue. Also, when more than a single node is labelled as Gateway node, the current code was not handling it properly, this PR fixes it. Fixes issue: https://github.com/submariner-io/submariner-operator/issues/1471 Signed-Off-by: Sridhar Gaddam --- pkg/subctl/cmd/diagnose/firewall.go | 86 ++++++++++++++++++++++- pkg/subctl/cmd/diagnose/firewall_vxlan.go | 19 +++-- pkg/subctl/cmd/diagnose/tunnel.go | 44 +----------- 3 files changed, 93 insertions(+), 56 deletions(-) diff --git a/pkg/subctl/cmd/diagnose/firewall.go b/pkg/subctl/cmd/diagnose/firewall.go index 40f34a239..bf7b2c97c 100644 --- a/pkg/subctl/cmd/diagnose/firewall.go +++ b/pkg/subctl/cmd/diagnose/firewall.go @@ -18,7 +18,13 @@ limitations under the License. package diagnose import ( + "context" + "github.com/spf13/cobra" + "github.com/submariner-io/submariner-operator/pkg/internal/cli" + "github.com/submariner-io/submariner-operator/pkg/subctl/cmd" + subv1 "github.com/submariner-io/submariner/pkg/apis/submariner.io/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/client-go/kubernetes" "github.com/submariner-io/submariner-operator/pkg/subctl/resource" @@ -32,10 +38,10 @@ var diagnoseFirewallConfigCmd = &cobra.Command{ var validationTimeout uint -func addDiagnoseFWConfigFlags(cmd *cobra.Command) { - cmd.Flags().UintVar(&validationTimeout, "validation-timeout", 90, +func addDiagnoseFWConfigFlags(command *cobra.Command) { + command.Flags().UintVar(&validationTimeout, "validation-timeout", 90, "timeout in seconds while validating the connection attempt") - addNamespaceFlag(cmd) + addNamespaceFlag(command) } func init() { @@ -74,3 +80,77 @@ func spawnPod(client kubernetes.Interface, scheduling resource.PodScheduling, po return pod, nil } + +func getActiveGatewayNodeName(cluster *cmd.Cluster, hostname string, status *cli.Status) string { + nodes, err := cluster.KubeClient.CoreV1().Nodes().List(context.TODO(), metav1.ListOptions{ + LabelSelector: "submariner.io/gateway=true", + }) + if err != nil { + status.EndWithFailure("Error obtaining the Gateway Nodes in cluster %q: %v", cluster.Name, err) + return "" + } + + for _, node := range nodes.Items { + if node.Name == hostname { + return hostname + } + + // On some platforms, the nodeName does not match with the hostname. + // Submariner Endpoint stores the hostname info in the endpoint and not the nodeName. So, we spawn a + // tiny pod to read the hostname and return the corresponding node. + sPod, err := spawnSnifferPodOnNode(cluster.KubeClient, node.Name, "default", "hostname") + if err != nil { + status.EndWithFailure("Error spawning the sniffer pod on the node %q: %v", node.Name, err) + return "" + } + + defer sPod.DeletePod() + + if err = sPod.AwaitPodCompletion(); err != nil { + status.EndWithFailure("Error waiting for the sniffer pod to finish its execution on node %q: %v", node.Name, err) + return "" + } + + if sPod.PodOutput[:len(sPod.PodOutput)-1] == hostname { + return node.Name + } + } + + status.EndWithFailure("Could not find the active Gateway node %q in local cluster in cluster %q", + hostname, cluster.Name) + return "" +} + +func getLocalEndpointResource(cluster *cmd.Cluster, status *cli.Status) *subv1.Endpoint { + endpoints, err := cluster.SubmClient.SubmarinerV1().Endpoints(cmd.OperatorNamespace).List(context.TODO(), metav1.ListOptions{}) + if err != nil { + status.EndWithFailure("Error obtaining the Endpoints in cluster %q: %v", cluster.Name, err) + return nil + } + + for _, endpoint := range endpoints.Items { + if endpoint.Spec.ClusterID == cluster.Name { + return &endpoint + } + } + + status.EndWithFailure("Could not find the local Endpoint in cluster %q", cluster.Name) + return nil +} + +func getAnyRemoteEndpointResource(cluster *cmd.Cluster, status *cli.Status) *subv1.Endpoint { + endpoints, err := cluster.SubmClient.SubmarinerV1().Endpoints(cmd.OperatorNamespace).List(context.TODO(), metav1.ListOptions{}) + if err != nil { + status.EndWithFailure("Error obtaining the Endpoints in cluster %q: %v", cluster.Name, err) + return nil + } + + for _, endpoint := range endpoints.Items { + if endpoint.Spec.ClusterID != cluster.Name { + return &endpoint + } + } + + status.EndWithFailure("Could not find any remote Endpoint in cluster %q", cluster.Name) + return nil +} diff --git a/pkg/subctl/cmd/diagnose/firewall_vxlan.go b/pkg/subctl/cmd/diagnose/firewall_vxlan.go index 0ce762807..3acade2e2 100644 --- a/pkg/subctl/cmd/diagnose/firewall_vxlan.go +++ b/pkg/subctl/cmd/diagnose/firewall_vxlan.go @@ -76,32 +76,31 @@ func checkFWConfig(cluster *cmd.Cluster, status *cli.Status) { return } - gateways, err := cluster.GetGateways() - if err != nil { - status.QueueFailureMessage(fmt.Sprintf("Error retrieving Gateways: %v", err)) + localEndpoint := getLocalEndpointResource(cluster, status) + if localEndpoint == nil { return } - if len(gateways.Items) == 0 { - status.QueueWarningMessage("There are no gateways detected") + remoteEndpoint := getAnyRemoteEndpointResource(cluster, status) + if remoteEndpoint == nil { return } - if len(gateways.Items[0].Status.Connections) == 0 { - status.QueueWarningMessage("There are no active connections to remote clusters") + gwNodeName := getActiveGatewayNodeName(cluster, localEndpoint.Spec.Hostname, status) + if gwNodeName == "" { return } podCommand := fmt.Sprintf("timeout %d %s", validationTimeout, TCPSniffVxLANCommand) - sPod, err := spawnSnifferPodOnGatewayNode(cluster.KubeClient, podNamespace, podCommand) + sPod, err := spawnSnifferPodOnNode(cluster.KubeClient, gwNodeName, podNamespace, podCommand) if err != nil { - status.QueueFailureMessage(fmt.Sprintf("Error spawning the sniffer pod on the Gateway node: %v", err)) + status.EndWithFailure("Error spawning the sniffer pod on the Gateway node: %v", err) return } defer sPod.DeletePod() - remoteClusterIP := strings.Split(gateways.Items[0].Status.Connections[0].Endpoint.Subnets[0], "/")[0] + remoteClusterIP := strings.Split(remoteEndpoint.Spec.Subnets[0], "/")[0] podCommand = fmt.Sprintf("nc -w %d %s 8080", validationTimeout/2, remoteClusterIP) cPod, err := spawnClientPodOnNonGatewayNode(cluster.KubeClient, podNamespace, podCommand) if err != nil { diff --git a/pkg/subctl/cmd/diagnose/tunnel.go b/pkg/subctl/cmd/diagnose/tunnel.go index 616d21ba0..4f79b4e36 100644 --- a/pkg/subctl/cmd/diagnose/tunnel.go +++ b/pkg/subctl/cmd/diagnose/tunnel.go @@ -18,7 +18,6 @@ limitations under the License. package diagnose import ( - "context" "fmt" "os" "strings" @@ -30,8 +29,6 @@ import ( "github.com/submariner-io/submariner-operator/pkg/subctl/cmd/utils" "github.com/submariner-io/submariner-operator/pkg/subctl/cmd/utils/restconfig" subv1 "github.com/submariner-io/submariner/pkg/apis/submariner.io/v1" - corev1 "k8s.io/api/core/v1" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/util/uuid" "k8s.io/client-go/rest" ) @@ -108,7 +105,7 @@ func validateTunnelConfigAcrossClusters(localCfg, remoteCfg *rest.Config) bool { status := cli.NewStatus() status.Start(fmt.Sprintf("Checking if tunnels can be setup on the gateway node of cluster %q", localCluster.Name)) - localEndpoint := getEndpointResource(localCluster, status) + localEndpoint := getLocalEndpointResource(localCluster, status) if localEndpoint == nil { return false } @@ -180,45 +177,6 @@ func validateTunnelConfigAcrossClusters(localCfg, remoteCfg *rest.Config) bool { return true } -func getEndpointResource(cluster *cmd.Cluster, status *cli.Status) *subv1.Endpoint { - endpoints, err := cluster.SubmClient.SubmarinerV1().Endpoints(cmd.OperatorNamespace).List(context.TODO(), metav1.ListOptions{}) - if err != nil { - status.EndWithFailure("Error obtaining the Endpoints in cluster %q: %v", cluster.Name, err) - return nil - } - - for _, endpoint := range endpoints.Items { - if endpoint.Spec.ClusterID == cluster.Name { - return &endpoint - } - } - - status.EndWithFailure("Could not find the local Endpoint in cluster %q", cluster.Name) - return nil -} - -func getActiveGatewayNodeName(cluster *cmd.Cluster, hostname string, status *cli.Status) string { - nodes, err := cluster.KubeClient.CoreV1().Nodes().List(context.TODO(), metav1.ListOptions{}) - if err != nil { - status.EndWithFailure("Error obtaining the Nodes in cluster %q: %v", cluster.Name, err) - return "" - } - - for _, node := range nodes.Items { - for _, addr := range node.Status.Addresses { - if addr.Type == corev1.NodeHostName { - if strings.HasPrefix(addr.Address, hostname) { - return node.Name - } - } - } - } - - status.EndWithFailure("Could not find the active Gateway node %q in local cluster in cluster %q", - hostname, cluster.Name) - return "" -} - func getTunnelPort(submariner *v1alpha1.Submariner, endpoint *subv1.Endpoint, status *cli.Status) (int32, bool) { var tunnelPort int32 var err error