Skip to content

Commit

Permalink
Fix subctl diagnose hostname mismatch issue
Browse files Browse the repository at this point in the history
Submariner Endpoint stores the hostname info as part of the endpoint object.
In most of the K8s clusters, the hostname matches with the nodeName, but on
some clusters, it was seen that nodeName does not match. This PR fixes this
issue.

Also, when more than a single node is labelled as Gateway node, the current
code was not handling it properly, this PR fixes it.

Fixes issue: submariner-io#1471
Signed-Off-by: Sridhar Gaddam <sgaddam@redhat.com>
  • Loading branch information
sridhargaddam committed Jul 15, 2021
1 parent 4553bde commit e0ec5fb
Show file tree
Hide file tree
Showing 3 changed files with 93 additions and 56 deletions.
86 changes: 83 additions & 3 deletions pkg/subctl/cmd/diagnose/firewall.go
Expand Up @@ -18,7 +18,13 @@ limitations under the License.
package diagnose

import (
"context"

"github.com/spf13/cobra"
"github.com/submariner-io/submariner-operator/pkg/internal/cli"
"github.com/submariner-io/submariner-operator/pkg/subctl/cmd"
subv1 "github.com/submariner-io/submariner/pkg/apis/submariner.io/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/client-go/kubernetes"

"github.com/submariner-io/submariner-operator/pkg/subctl/resource"
Expand All @@ -32,10 +38,10 @@ var diagnoseFirewallConfigCmd = &cobra.Command{

var validationTimeout uint

func addDiagnoseFWConfigFlags(cmd *cobra.Command) {
cmd.Flags().UintVar(&validationTimeout, "validation-timeout", 90,
func addDiagnoseFWConfigFlags(command *cobra.Command) {
command.Flags().UintVar(&validationTimeout, "validation-timeout", 90,
"timeout in seconds while validating the connection attempt")
addNamespaceFlag(cmd)
addNamespaceFlag(command)
}

func init() {
Expand Down Expand Up @@ -74,3 +80,77 @@ func spawnPod(client kubernetes.Interface, scheduling resource.PodScheduling, po

return pod, nil
}

func getActiveGatewayNodeName(cluster *cmd.Cluster, hostname string, status *cli.Status) string {
nodes, err := cluster.KubeClient.CoreV1().Nodes().List(context.TODO(), metav1.ListOptions{
LabelSelector: "submariner.io/gateway=true",
})
if err != nil {
status.EndWithFailure("Error obtaining the Gateway Nodes in cluster %q: %v", cluster.Name, err)
return ""
}

for _, node := range nodes.Items {
if node.Name == hostname {
return hostname
}

// On some platforms, the nodeName does not match with the hostname.
// Submariner Endpoint stores the hostname info in the endpoint and not the nodeName. So, we spawn a
// tiny pod to read the hostname and return the corresponding node.
sPod, err := spawnSnifferPodOnNode(cluster.KubeClient, node.Name, "default", "hostname")
if err != nil {
status.EndWithFailure("Error spawning the sniffer pod on the node %q: %v", node.Name, err)
return ""
}

defer sPod.DeletePod()

if err = sPod.AwaitPodCompletion(); err != nil {
status.EndWithFailure("Error waiting for the sniffer pod to finish its execution on node %q: %v", node.Name, err)
return ""
}

if sPod.PodOutput[:len(sPod.PodOutput)-1] == hostname {
return node.Name
}
}

status.EndWithFailure("Could not find the active Gateway node %q in local cluster in cluster %q",
hostname, cluster.Name)
return ""
}

func getLocalEndpointResource(cluster *cmd.Cluster, status *cli.Status) *subv1.Endpoint {
endpoints, err := cluster.SubmClient.SubmarinerV1().Endpoints(cmd.OperatorNamespace).List(context.TODO(), metav1.ListOptions{})
if err != nil {
status.EndWithFailure("Error obtaining the Endpoints in cluster %q: %v", cluster.Name, err)
return nil
}

for _, endpoint := range endpoints.Items {
if endpoint.Spec.ClusterID == cluster.Name {
return &endpoint
}
}

status.EndWithFailure("Could not find the local Endpoint in cluster %q", cluster.Name)
return nil
}

func getAnyRemoteEndpointResource(cluster *cmd.Cluster, status *cli.Status) *subv1.Endpoint {
endpoints, err := cluster.SubmClient.SubmarinerV1().Endpoints(cmd.OperatorNamespace).List(context.TODO(), metav1.ListOptions{})
if err != nil {
status.EndWithFailure("Error obtaining the Endpoints in cluster %q: %v", cluster.Name, err)
return nil
}

for _, endpoint := range endpoints.Items {
if endpoint.Spec.ClusterID != cluster.Name {
return &endpoint
}
}

status.EndWithFailure("Could not find any remote Endpoint in cluster %q", cluster.Name)
return nil
}
19 changes: 9 additions & 10 deletions pkg/subctl/cmd/diagnose/firewall_vxlan.go
Expand Up @@ -76,32 +76,31 @@ func checkFWConfig(cluster *cmd.Cluster, status *cli.Status) {
return
}

gateways, err := cluster.GetGateways()
if err != nil {
status.QueueFailureMessage(fmt.Sprintf("Error retrieving Gateways: %v", err))
localEndpoint := getLocalEndpointResource(cluster, status)
if localEndpoint == nil {
return
}

if len(gateways.Items) == 0 {
status.QueueWarningMessage("There are no gateways detected")
remoteEndpoint := getAnyRemoteEndpointResource(cluster, status)
if remoteEndpoint == nil {
return
}

if len(gateways.Items[0].Status.Connections) == 0 {
status.QueueWarningMessage("There are no active connections to remote clusters")
gwNodeName := getActiveGatewayNodeName(cluster, localEndpoint.Spec.Hostname, status)
if gwNodeName == "" {
return
}

podCommand := fmt.Sprintf("timeout %d %s", validationTimeout, TCPSniffVxLANCommand)
sPod, err := spawnSnifferPodOnGatewayNode(cluster.KubeClient, podNamespace, podCommand)
sPod, err := spawnSnifferPodOnNode(cluster.KubeClient, gwNodeName, podNamespace, podCommand)
if err != nil {
status.QueueFailureMessage(fmt.Sprintf("Error spawning the sniffer pod on the Gateway node: %v", err))
status.EndWithFailure("Error spawning the sniffer pod on the Gateway node: %v", err)
return
}

defer sPod.DeletePod()

remoteClusterIP := strings.Split(gateways.Items[0].Status.Connections[0].Endpoint.Subnets[0], "/")[0]
remoteClusterIP := strings.Split(remoteEndpoint.Spec.Subnets[0], "/")[0]
podCommand = fmt.Sprintf("nc -w %d %s 8080", validationTimeout/2, remoteClusterIP)
cPod, err := spawnClientPodOnNonGatewayNode(cluster.KubeClient, podNamespace, podCommand)
if err != nil {
Expand Down
44 changes: 1 addition & 43 deletions pkg/subctl/cmd/diagnose/tunnel.go
Expand Up @@ -18,7 +18,6 @@ limitations under the License.
package diagnose

import (
"context"
"fmt"
"os"
"strings"
Expand All @@ -30,8 +29,6 @@ import (
"github.com/submariner-io/submariner-operator/pkg/subctl/cmd/utils"
"github.com/submariner-io/submariner-operator/pkg/subctl/cmd/utils/restconfig"
subv1 "github.com/submariner-io/submariner/pkg/apis/submariner.io/v1"
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/util/uuid"
"k8s.io/client-go/rest"
)
Expand Down Expand Up @@ -108,7 +105,7 @@ func validateTunnelConfigAcrossClusters(localCfg, remoteCfg *rest.Config) bool {
status := cli.NewStatus()
status.Start(fmt.Sprintf("Checking if tunnels can be setup on the gateway node of cluster %q", localCluster.Name))

localEndpoint := getEndpointResource(localCluster, status)
localEndpoint := getLocalEndpointResource(localCluster, status)
if localEndpoint == nil {
return false
}
Expand Down Expand Up @@ -180,45 +177,6 @@ func validateTunnelConfigAcrossClusters(localCfg, remoteCfg *rest.Config) bool {
return true
}

func getEndpointResource(cluster *cmd.Cluster, status *cli.Status) *subv1.Endpoint {
endpoints, err := cluster.SubmClient.SubmarinerV1().Endpoints(cmd.OperatorNamespace).List(context.TODO(), metav1.ListOptions{})
if err != nil {
status.EndWithFailure("Error obtaining the Endpoints in cluster %q: %v", cluster.Name, err)
return nil
}

for _, endpoint := range endpoints.Items {
if endpoint.Spec.ClusterID == cluster.Name {
return &endpoint
}
}

status.EndWithFailure("Could not find the local Endpoint in cluster %q", cluster.Name)
return nil
}

func getActiveGatewayNodeName(cluster *cmd.Cluster, hostname string, status *cli.Status) string {
nodes, err := cluster.KubeClient.CoreV1().Nodes().List(context.TODO(), metav1.ListOptions{})
if err != nil {
status.EndWithFailure("Error obtaining the Nodes in cluster %q: %v", cluster.Name, err)
return ""
}

for _, node := range nodes.Items {
for _, addr := range node.Status.Addresses {
if addr.Type == corev1.NodeHostName {
if strings.HasPrefix(addr.Address, hostname) {
return node.Name
}
}
}
}

status.EndWithFailure("Could not find the active Gateway node %q in local cluster in cluster %q",
hostname, cluster.Name)
return ""
}

func getTunnelPort(submariner *v1alpha1.Submariner, endpoint *subv1.Endpoint, status *cli.Status) (int32, bool) {
var tunnelPort int32
var err error
Expand Down

0 comments on commit e0ec5fb

Please sign in to comment.