Skip to content

Commit

Permalink
bpf: nodeport: handle revDNAT for local backends at to-netdev/to-overlay
Browse files Browse the repository at this point in the history
Replies by local service backends currently get their revDNAT processing in
from-container. There's two problems with that:
1. we have cases where the packet doesn't reach this part of the
   from-container program (ie. redirect to host proxy), and
2. we're not handling replies by hostns backends at all.

So add an additional check for RevDNAT in handle_nat_fwd() at to-netdev,
before such an untranslated reply leaves the node. The same is needed for
to-overlay, see
3a83623 ("bpf: add support for local NodePort via tunnel").

Fixes: cilium#22659
Fixes: cilium#22838
Fixes: cilium#21955
Signed-off-by: Julian Wiedmann <jwi@isovalent.com>
  • Loading branch information
julianwiedmann authored and ldelossa committed Jan 19, 2023
1 parent a12b9ec commit 54a8631
Show file tree
Hide file tree
Showing 4 changed files with 193 additions and 13 deletions.
5 changes: 1 addition & 4 deletions bpf/bpf_host.c
Original file line number Diff line number Diff line change
Expand Up @@ -1194,10 +1194,7 @@ int cil_to_netdev(struct __ctx_buff *ctx __maybe_unused)
METRIC_EGRESS);
#endif /* ENABLE_SRV6 */

#if defined(ENABLE_NODEPORT) && \
(!defined(ENABLE_DSR) || \
(defined(ENABLE_DSR) && defined(ENABLE_DSR_HYBRID)) || \
defined(ENABLE_MASQUERADE))
#ifdef ENABLE_NODEPORT
if (!ctx_snat_done(ctx)) {
/*
* handle_nat_fwd tail calls in the majority of cases,
Expand Down
4 changes: 3 additions & 1 deletion bpf/lib/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -123,7 +123,9 @@ enum {
#define CILIUM_CALL_SRV6_REPLY 35
#define CILIUM_CALL_IPV4_NODEPORT_NAT_INGRESS 36
#define CILIUM_CALL_IPV6_NODEPORT_NAT_INGRESS 37
#define CILIUM_CALL_SIZE 38
#define CILIUM_CALL_IPV4_NODEPORT_SNAT_FWD 38
#define CILIUM_CALL_IPV6_NODEPORT_SNAT_FWD 39
#define CILIUM_CALL_SIZE 40

typedef __u64 mac_t;

Expand Down
191 changes: 185 additions & 6 deletions bpf/lib/nodeport.h
Original file line number Diff line number Diff line change
Expand Up @@ -979,6 +979,48 @@ static __always_inline int nodeport_lb6(struct __ctx_buff *ctx,
return DROP_MISSED_TAIL_CALL;
}

static __always_inline int
nodeport_rev_dnat_fwd_ipv6(struct __ctx_buff *ctx, struct trace_ctx *trace)
{
struct ipv6_ct_tuple tuple = {};
struct ct_state ct_state = {};
void *data, *data_end;
struct ipv6hdr *ip6;
int ret, l4_off;

if (!revalidate_data(ctx, &data, &data_end, &ip6))
return DROP_INVALID;

ret = lb6_extract_tuple(ctx, ip6, &l4_off, &tuple);
if (ret < 0) {
if (ret == DROP_NO_SERVICE || ret == DROP_UNKNOWN_L4)
return CTX_ACT_OK;
return ret;
}

if (!ct_has_nodeport_egress_entry6(get_ct_map6(&tuple), &tuple))
return CTX_ACT_OK;

ret = ct_lookup6(get_ct_map6(&tuple), &tuple, ctx, l4_off, CT_INGRESS,
&ct_state, &trace->monitor);

if (ret == CT_REPLY && ct_state.node_port && ct_state.rev_nat_index) {
struct csum_offset csum_off = {};

trace->reason = TRACE_REASON_CT_REPLY;
csum_l4_offset_and_flags(tuple.nexthdr, &csum_off);

ret = lb6_rev_nat(ctx, l4_off, &csum_off, ct_state.rev_nat_index,
&tuple, REV_NAT_F_TUPLE_SADDR);
if (IS_ERR(ret))
return ret;

ctx_snat_done_set(ctx);
}

return CTX_ACT_OK;
}

static __always_inline int rev_nodeport_lb6(struct __ctx_buff *ctx, __u32 *ifindex,
int *ext_err)
{
Expand Down Expand Up @@ -1168,17 +1210,56 @@ int tail_rev_nodeport_lb6(struct __ctx_buff *ctx)
return send_drop_notify_error_ext(ctx, 0, ret, ext_err, CTX_ACT_DROP, METRIC_EGRESS);
}

static __always_inline int handle_nat_fwd_ipv6(struct __ctx_buff *ctx)
__section_tail(CILIUM_MAP_CALLS, CILIUM_CALL_IPV6_NODEPORT_SNAT_FWD)
int tail_handle_snat_fwd_ipv6(struct __ctx_buff *ctx)
{
enum trace_point obs_point;
int ret;
#if defined(TUNNEL_MODE) && defined(IS_BPF_OVERLAY)
union v6addr addr = { .p1 = 0 };

obs_point = TRACE_TO_OVERLAY;
BPF_V6(addr, ROUTER_IP);
#else
union v6addr addr = IPV6_DIRECT_ROUTING;

obs_point = TRACE_TO_NETWORK;
#endif

return nodeport_snat_fwd_ipv6(ctx, &addr);
ret = nodeport_snat_fwd_ipv6(ctx, &addr);
if (IS_ERR(ret))
return send_drop_notify_error(ctx, 0, ret, CTX_ACT_DROP, METRIC_EGRESS);

send_trace_notify(ctx, obs_point, 0, 0, 0, 0, TRACE_REASON_UNKNOWN, 0);

return ret;
}

static __always_inline int
__handle_nat_fwd_ipv6(struct __ctx_buff *ctx, struct trace_ctx *trace)
{
int ret;

ret = nodeport_rev_dnat_fwd_ipv6(ctx, trace);
if (IS_ERR(ret))
return ret;

#if !defined(ENABLE_DSR) || \
(defined(ENABLE_DSR) && defined(ENABLE_DSR_HYBRID))
if (!ctx_snat_done(ctx)) {
ep_tail_call(ctx, CILIUM_CALL_IPV6_NODEPORT_SNAT_FWD);
ret = DROP_MISSED_TAIL_CALL;
}
#endif

return ret;
}

static __always_inline int handle_nat_fwd_ipv6(struct __ctx_buff *ctx)
{
struct trace_ctx trace;

return __handle_nat_fwd_ipv6(ctx, &trace);
}

declare_tailcall_if(__or(__and(is_defined(ENABLE_IPV4),
Expand All @@ -1188,6 +1269,10 @@ declare_tailcall_if(__or(__and(is_defined(ENABLE_IPV4),
CILIUM_CALL_IPV6_NODEPORT_NAT_FWD)
int tail_handle_nat_fwd_ipv6(struct __ctx_buff *ctx)
{
struct trace_ctx trace = {
.reason = TRACE_REASON_UNKNOWN,
.monitor = TRACE_PAYLOAD_LEN,
};
int ret;
enum trace_point obs_point;

Expand All @@ -1201,7 +1286,8 @@ int tail_handle_nat_fwd_ipv6(struct __ctx_buff *ctx)
if (IS_ERR(ret))
return send_drop_notify_error(ctx, 0, ret, CTX_ACT_DROP, METRIC_EGRESS);

send_trace_notify(ctx, obs_point, 0, 0, 0, 0, TRACE_REASON_UNKNOWN, 0);
send_trace_notify(ctx, obs_point, 0, 0, 0, 0, trace.reason,
trace.monitor);

return ret;
}
Expand Down Expand Up @@ -1969,6 +2055,50 @@ static __always_inline int nodeport_lb4(struct __ctx_buff *ctx,
return DROP_MISSED_TAIL_CALL;
}

static __always_inline int
nodeport_rev_dnat_fwd_ipv4(struct __ctx_buff *ctx, struct trace_ctx *trace)
{
int ret, l3_off = ETH_HLEN, l4_off;
struct ipv4_ct_tuple tuple = {};
struct ct_state ct_state = {};
void *data, *data_end;
struct iphdr *ip4;

if (!revalidate_data(ctx, &data, &data_end, &ip4))
return DROP_INVALID;

ret = lb4_extract_tuple(ctx, ip4, &l4_off, &tuple);
if (ret < 0) {
/* If it's not a SVC protocol, we don't need to check for RevDNAT: */
if (ret == DROP_NO_SERVICE || ret == DROP_UNKNOWN_L4)
return CTX_ACT_OK;
return ret;
}

if (!ct_has_nodeport_egress_entry4(get_ct_map4(&tuple), &tuple))
return CTX_ACT_OK;

ret = ct_lookup4(get_ct_map4(&tuple), &tuple, ctx, l4_off, CT_INGRESS,
&ct_state, &trace->monitor);

if (ret == CT_REPLY && ct_state.node_port && ct_state.rev_nat_index) {
struct csum_offset csum_off = {};

trace->reason = TRACE_REASON_CT_REPLY;
csum_l4_offset_and_flags(tuple.nexthdr, &csum_off);

ret = lb4_rev_nat(ctx, l3_off, l4_off, &csum_off, &ct_state,
&tuple, REV_NAT_F_TUPLE_SADDR,
ipv4_has_l4_header(ip4));
if (IS_ERR(ret))
return ret;

ctx_snat_done_set(ctx);
}

return CTX_ACT_OK;
}

/* Reverse NAT handling of node-port traffic for the case where the
* backend i) was a local EP and bpf_lxc redirected to us, ii) was
* a remote backend and we got here after reverse SNAT from the
Expand Down Expand Up @@ -2167,9 +2297,53 @@ int tail_rev_nodeport_lb4(struct __ctx_buff *ctx)
return ret;
}

__section_tail(CILIUM_MAP_CALLS, CILIUM_CALL_IPV4_NODEPORT_SNAT_FWD)
int tail_handle_snat_fwd_ipv4(struct __ctx_buff *ctx)
{
enum trace_point obs_point;
int ret;

#if defined(TUNNEL_MODE) && defined(IS_BPF_OVERLAY)
obs_point = TRACE_TO_OVERLAY;
#else
obs_point = TRACE_TO_NETWORK;
#endif

ret = nodeport_snat_fwd_ipv4(ctx);
if (IS_ERR(ret))
return send_drop_notify_error(ctx, 0, ret, CTX_ACT_DROP, METRIC_EGRESS);

send_trace_notify(ctx, obs_point, 0, 0, 0, 0, TRACE_REASON_UNKNOWN, 0);

return ret;
}

static __always_inline int
__handle_nat_fwd_ipv4(struct __ctx_buff *ctx, struct trace_ctx *trace)
{
int ret;

ret = nodeport_rev_dnat_fwd_ipv4(ctx, trace);
if (IS_ERR(ret))
return ret;

#if !defined(ENABLE_DSR) || \
(defined(ENABLE_DSR) && defined(ENABLE_DSR_HYBRID)) || \
defined(ENABLE_MASQUERADE)
if (!ctx_snat_done(ctx)) {
ep_tail_call(ctx, CILIUM_CALL_IPV4_NODEPORT_SNAT_FWD);
ret = DROP_MISSED_TAIL_CALL;
}
#endif

return ret;
}

static __always_inline int handle_nat_fwd_ipv4(struct __ctx_buff *ctx)
{
return nodeport_snat_fwd_ipv4(ctx);
struct trace_ctx trace;

return __handle_nat_fwd_ipv4(ctx, &trace);
}

declare_tailcall_if(__or3(__and(is_defined(ENABLE_IPV4),
Expand All @@ -2180,6 +2354,10 @@ declare_tailcall_if(__or3(__and(is_defined(ENABLE_IPV4),
CILIUM_CALL_IPV4_NODEPORT_NAT_FWD)
int tail_handle_nat_fwd_ipv4(struct __ctx_buff *ctx)
{
struct trace_ctx trace = {
.reason = TRACE_REASON_UNKNOWN,
.monitor = TRACE_PAYLOAD_LEN,
};
int ret;
enum trace_point obs_point;

Expand All @@ -2189,11 +2367,12 @@ int tail_handle_nat_fwd_ipv4(struct __ctx_buff *ctx)
obs_point = TRACE_TO_NETWORK;
#endif

ret = handle_nat_fwd_ipv4(ctx);
ret = __handle_nat_fwd_ipv4(ctx, &trace);
if (IS_ERR(ret))
return send_drop_notify_error(ctx, 0, ret, CTX_ACT_DROP, METRIC_EGRESS);

send_trace_notify(ctx, obs_point, 0, 0, 0, 0, TRACE_REASON_UNKNOWN, 0);
send_trace_notify(ctx, obs_point, 0, 0, 0, 0, trace.reason,
trace.monitor);

return ret;
}
Expand Down
6 changes: 4 additions & 2 deletions test/bpf/check-complexity.sh
Original file line number Diff line number Diff line change
Expand Up @@ -50,10 +50,12 @@ function annotate_section_names {
-e "s/\(section '2\/34'\)/\1 (tail_call SRV6_DECAP)/" \
-e "s/\(section '2\/35'\)/\1 (tail_call SRV6_REPLY)/" \
-e "s/\(section '2\/36'\)/\1 (tail_call IPV4_NODEPORT_NAT_INGRESS)/" \
-e "s/\(section '2\/37'\)/\1 (tail_call IPV6_NODEPORT_NAT_INGRESS)/"
-e "s/\(section '2\/37'\)/\1 (tail_call IPV6_NODEPORT_NAT_INGRESS)/" \
-e "s/\(section '2\/38'\)/\1 (tail_call IPV4_NODEPORT_SNAT_FWD)/" \
-e "s/\(section '2\/39'\)/\1 (tail_call IPV6_NODEPORT_SNAT_FWD)/"
}

if ! grep -q "CILIUM_CALL_SIZE.*38" "$BPFDIR/lib/common.h" ; then
if ! grep -q "CILIUM_CALL_SIZE.*40" "$BPFDIR/lib/common.h" ; then
echo "This script is out of date compared to CILIUM_CALL_SIZE." 1>&2
exit 1
fi
Expand Down

0 comments on commit 54a8631

Please sign in to comment.