Skip to content

Commit

Permalink
bpf: egressgw: sync logic to determine if destination is outside cluster
Browse files Browse the repository at this point in the history
In the context of egress gateway, when traffic is leaving the cluster we
need to check twice if it is a match for an egress NAT policy:

* first time in handle_ipv4_from_lxc(), on the node where the client pod is
  running (to determine if it should be forwarded to a gateway node)
* second time in snat_v4_needed(), on the actual gateway node (to
  determine if it should be SNATed)

Currently the 2 checks are slightly diverging wrt how traffic destined
to outside the cluster is identified:

* in the first case we use is_cluster_destination(), which uses the
  information stored on the ipcache and EP maps
* in the second case we just rely on the IPV4_SNAT_EXCLUSION_DST_CIDR

The issue with the IPV4_SNAT_EXCLUSION_DST_CIDR logic is that we may
incorrectly exclude from egress gw SNAT traffic that is supposed to be
SNATed: case in point an EKS environment where the primary VPC is shared
between the cluster and some other EC2 nodes that don't belong to the
cluster.

To fix this, this commit changes the snat_v4_needed() logic to match the
one we use in handle_ipv4_from_lxc() and executes it before the
IPV4_SNAT_EXCLUSION_DST_CIDR check.

Signed-off-by: Gilberto Bertin <gilberto@isovalent.com>
  • Loading branch information
jibi authored and christarazi committed Jan 5, 2022
1 parent e3dca63 commit cdfc30d
Show file tree
Hide file tree
Showing 2 changed files with 66 additions and 55 deletions.
5 changes: 5 additions & 0 deletions bpf/bpf_lxc.c
Original file line number Diff line number Diff line change
Expand Up @@ -794,6 +794,11 @@ static __always_inline int handle_ipv4_from_lxc(struct __ctx_buff *ctx,
struct egress_gw_policy_entry *egress_gw_policy;
struct endpoint_key key = {};

/* If the packet is destined to an entity inside the cluster,
* either EP or node, it should not be forwarded to an egress
* gateway since only traffic leaving the cluster is supposed to
* be masqueraded with an egress IP.
*/
if (is_cluster_destination(ip4, *dst_id, tunnel_endpoint))
goto skip_egress_gateway;

Expand Down
116 changes: 61 additions & 55 deletions bpf/lib/nodeport.h
Original file line number Diff line number Diff line change
Expand Up @@ -1061,6 +1061,8 @@ static __always_inline bool snat_v4_needed(struct __ctx_buff *ctx, __be32 *addr,
struct iphdr *ip4;
struct endpoint_info *local_ep __maybe_unused;
struct remote_endpoint_info *remote_ep __maybe_unused;
struct egress_gw_policy_entry *egress_gw_policy __maybe_unused;
bool is_reply = false;

if (!revalidate_data(ctx, &data, &data_end, &ip4))
return false;
Expand Down Expand Up @@ -1092,6 +1094,26 @@ static __always_inline bool snat_v4_needed(struct __ctx_buff *ctx, __be32 *addr,
# endif
#endif /* defined(TUNNEL_MODE) && defined(IS_BPF_OVERLAY) */

local_ep = __lookup_ip4_endpoint(ip4->saddr);
remote_ep = lookup_ip4_remote_endpoint(ip4->daddr);

/* Check if this packet belongs to reply traffic coming from a
* local endpoint.
*
* If local_ep is NULL, it means there's no endpoint running on the
* node which matches the packet source IP, which means we can
* skip the CT lookup since this cannot be reply traffic.
*/
if (local_ep) {
struct ipv4_ct_tuple tuple = {
.nexthdr = ip4->protocol,
.daddr = ip4->daddr,
.saddr = ip4->saddr
};

ct_is_reply4(get_ct_map4(&tuple), ctx, ETH_HLEN +
ipv4_hdrlen(ip4), &tuple, &is_reply);
}

#ifdef ENABLE_MASQUERADE /* SNAT local pod to world packets */
# ifdef IS_BPF_OVERLAY
Expand All @@ -1101,6 +1123,41 @@ static __always_inline bool snat_v4_needed(struct __ctx_buff *ctx, __be32 *addr,
*/
return false;
# endif

/* Check if the packet matches an egress NAT policy and so needs to be SNAT'ed.
*
* This check must happen before the IPV4_SNAT_EXCLUSION_DST_CIDR check below as
* the destination may be in the SNAT exclusion CIDR but regardless of that we
* always want to SNAT a packet if it's matched by an egress NAT policy.
*/
#if defined(ENABLE_EGRESS_GATEWAY)
/* If the packet is destined to an entity inside the cluster, either EP
* or node, skip SNAT since only traffic leaving the cluster is supposed
* to be masqueraded with an egress IP.
*/
if (!local_ep || (remote_ep &&
is_cluster_destination(ip4, remote_ep->sec_label,
remote_ep->tunnel_endpoint)))
goto skip_egress_gateway;

/* If the packet is a reply it means that outside has initiated the
* connection, so no need to SNAT the reply.
*/
if (is_reply)
goto skip_egress_gateway;

egress_gw_policy = lookup_ip4_egress_gw_policy(ip4->saddr, ip4->daddr);
if (!egress_gw_policy)
goto skip_egress_gateway;

*addr = egress_gw_policy->egress_ip;
*from_endpoint = true;

return true;

skip_egress_gateway:
#endif

#ifdef IPV4_SNAT_EXCLUSION_DST_CIDR
/* Do not MASQ if a dst IP belongs to a pods CIDR
* (ipv4-native-routing-cidr if specified, otherwise local pod CIDR).
Expand All @@ -1113,12 +1170,10 @@ static __always_inline bool snat_v4_needed(struct __ctx_buff *ctx, __be32 *addr,
return false;
#endif

local_ep = __lookup_ip4_endpoint(ip4->saddr);
/* if this is a localhost endpoint, no SNAT is needed */
if (local_ep && (local_ep->flags & ENDPOINT_F_HOST))
return false;

remote_ep = lookup_ip4_remote_endpoint(ip4->daddr);
if (remote_ep) {
#ifdef ENABLE_IP_MASQ_AGENT
/* Do not SNAT if dst belongs to any ip-masq-agent
Expand All @@ -1145,60 +1200,11 @@ static __always_inline bool snat_v4_needed(struct __ctx_buff *ctx, __be32 *addr,
return false;
#endif

/* Check if this packet belongs to reply traffic coming from a
* local endpoint.
*
* If local_ep is NULL, it means there's no endpoint running on
* the node which matches the packet source IP, which means we
* can skip the CT lookup since this cannot be reply traffic.
*/
if (local_ep) {
bool is_reply = false;
struct ipv4_ct_tuple tuple = {
.nexthdr = ip4->protocol,
.daddr = ip4->daddr,
.saddr = ip4->saddr
};

/* If the packet is a reply it means that outside has
* initiated the connection, so no need to SNAT the
* reply.
*/
if (!ct_is_reply4(get_ct_map4(&tuple), ctx, ETH_HLEN + ipv4_hdrlen(ip4),
&tuple, &is_reply) && is_reply)
return false;
}

#if defined(ENABLE_EGRESS_GATEWAY)
/* Check egress gateway policy only for traffic which matches
* one of the following conditions.
* - Not from a local endpoint (inc. local host): that tells us
* the traffic was redirected by an egress gateway policy to
* this node to be masqueraded.
* - Not destined for a remote node: that tells us the traffic
* is leaving the cluster. Inter-node traffic to remote pods
* would either leave through the tunnel or match the above
* IPV4_SNAT_EXCLUSION_DST_CIDR check.
/* If the packet is a reply it means that outside has
* initiated the connection, so no need to SNAT the
* reply.
*/
if (!local_ep || !identity_is_remote_node(remote_ep->sec_label)) {
struct egress_gw_policy_entry *egress_gw_policy;

/* Check if SNAT needs to be applied to the packet.
* Apply SNAT if there is an egress rule in ebpf map,
* and the packet is not coming out from overlay
* interface. If the packet is coming from an overlay
* interface it means it is forwarded to another node,
* instead of leaving the cluster.
*/
egress_gw_policy = lookup_ip4_egress_gw_policy(ip4->saddr, ip4->daddr);
if (egress_gw_policy) {
*addr = egress_gw_policy->egress_ip;
*from_endpoint = true;
return true;
}
}
#endif
if (local_ep) {
if (!is_reply && local_ep) {
*from_endpoint = true;
*addr = IPV4_MASQUERADE;
return true;
Expand Down

0 comments on commit cdfc30d

Please sign in to comment.