Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ICMP errors generated by tracked flows treated as related traffic #2247

Merged
merged 21 commits into from Apr 21, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
b63ddac
bpf: ct_make_key() to make keys CT easily
tomastigera Mar 27, 2020
ee18268
bpf: icmp related lookup for non-NATed traffic
tomastigera Mar 28, 2020
a684d88
bpf: icmp_skb_get_hdr() returns the icmp header
tomastigera Mar 31, 2020
1c8652b
bpf: icmp related for NAT
tomastigera Mar 31, 2020
c49f205
bpf: ut for NAT related from the host
tomastigera Mar 31, 2020
290081d
fv: allow creating inactive workloads
tomastigera Apr 1, 2020
57cf27a
fv: allow to start inactive workload
tomastigera Apr 1, 2020
910c0f6
fv/bpf: addWorkload returns the workload
tomastigera Apr 1, 2020
7fc04a8
bpf/fv: fix the IP size in makeICMPError() and test ports
tomastigera Apr 2, 2020
b483d5e
fv/bpf: check icmp related outer IP header
tomastigera Apr 2, 2020
cbf1998
bpf: tunneling from host and csum of icmp related
tomastigera Apr 2, 2020
8739e6f
fv: tcpdump fails test if it never listened
tomastigera Apr 16, 2020
43c8738
fv: tcpdump for containers without tcpdump installed
tomastigera Apr 16, 2020
5dc2d48
bpf: nodeports and icmp related
tomastigera Apr 3, 2020
65314c5
bpf: update conntrack vars only when icmp related success
tomastigera Apr 3, 2020
e4759c1
bpf: SNAT of outer IP only if returning to outer client
tomastigera Apr 3, 2020
14b7721
fv: icmp related workload-workload
tomastigera Apr 15, 2020
5c6a24e
bpf: fix comment in handling ttl
tomastigera Apr 16, 2020
7cbe463
fv: icmp related workload-workload through service
tomastigera Apr 17, 2020
aa28d96
bpf: icmp related returning from tunnel is fwd approved
tomastigera Apr 17, 2020
0364cf6
bpf: fixed comment in icmp related SNAT
tomastigera Apr 20, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
201 changes: 138 additions & 63 deletions bpf-gpl/conntrack.h
Expand Up @@ -21,6 +21,7 @@
#include <linux/in.h>
#include "nat.h"
#include "bpf.h"
#include "icmp.h"

// Connection tracking.

Expand All @@ -30,6 +31,23 @@ struct calico_ct_key {
uint16_t port_a, port_b; // HBO
};

/* src_lt_dest evaluates to non-zero when the (ip_src, sport) endpoint orders
 * strictly before the (ip_dst, dport) endpoint: the addresses are compared
 * first and the ports only break a tie between equal addresses.
 *
 * The whole expansion is parenthesized so that the macro can be negated or
 * combined with other operators without the caller's operator binding to the
 * first comparison only.
 *
 * ip_src and ip_dst may each be evaluated twice, so the arguments must not
 * have side effects.
 */
#define src_lt_dest(ip_src, ip_dst, sport, dport) \
	(((ip_src) < (ip_dst)) || (((ip_src) == (ip_dst)) && ((sport) < (dport))))

/* __ct_make_key yields a struct calico_ct_key compound literal whose protocol
 * and a/b address and port fields are set from the arguments exactly in the
 * order given; the two endpoints are not reordered here.
 */
#define __ct_make_key(l4proto, ip_first, ip_second, port_first, port_second) \
	(struct calico_ct_key) { \
		.protocol = (l4proto), \
		.addr_a = (ip_first), \
		.addr_b = (ip_second), \
		.port_a = (port_first), \
		.port_b = (port_second), \
	}

/* ct_make_key evaluates to a struct calico_ct_key for the given protocol and
 * endpoints. When sltd is true, (ipa, pta) fill the a-side of the key and
 * (ipb, ptb) the b-side; otherwise the two endpoints are swapped. Callers in
 * this file pass the result of src_lt_dest() as sltd.
 *
 * The key is handed to dump_ct_key() before it is returned.
 *
 * sltd is parenthesized so that any expression can be passed as the
 * condition, and the temporary uses a macro-private name so that it cannot
 * capture or shadow an identifier named k at the call site.
 */
#define ct_make_key(sltd, p, ipa, ipb, pta, ptb) ({ \
	struct calico_ct_key __ct_key = (sltd) ? \
		__ct_make_key(p, ipa, ipb, pta, ptb) : \
		__ct_make_key(p, ipb, ipa, ptb, pta); \
	dump_ct_key(&__ct_key); \
	__ct_key; \
})

enum cali_ct_type {
CALI_CT_TYPE_NORMAL = 0x00, /* Non-NATted entry. */
CALI_CT_TYPE_NAT_FWD = 0x01, /* Forward entry for a DNATted flow, keyed on orig src/dst.
Expand Down Expand Up @@ -148,21 +166,8 @@ static CALI_BPF_INLINE int calico_ct_v4_create_tracking(struct ct_ctx *ctx,
*/
CALI_DEBUG("CT-ALL Asked to create entry but packet is marked as "
"from another endpoint, doing lookup\n");
bool srcLTDest = (ip_src < ip_dst) || ((ip_src == ip_dst) && sport < dport);
if (srcLTDest) {
*k = (struct calico_ct_key) {
.protocol = ctx->proto,
.addr_a = ip_src, .port_a = sport,
.addr_b = ip_dst, .port_b = dport,
};
} else {
*k = (struct calico_ct_key) {
.protocol = ctx->proto,
.addr_a = ip_dst, .port_a = dport,
.addr_b = ip_src, .port_b = sport,
};
}
dump_ct_key(k);
bool srcLTDest = src_lt_dest(ip_src, ip_dst, sport, dport);
*k = ct_make_key(srcLTDest, ctx->proto, ip_src, ip_dst, sport, dport);
struct calico_ct_value *ct_value = bpf_map_lookup_elem(&cali_v4_ct, k);
if (!ct_value) {
CALI_VERB("CT Packet marked as from workload but got a conntrack miss!\n");
Expand Down Expand Up @@ -327,6 +332,14 @@ enum calico_ct_result_type {
CALI_CT_INVALID,
};

/* Flag bit that sits above the low 8 bits of a lookup result code.
 * calico_ct_v4_lookup() sets it on result.rc when the conntrack hit was found
 * through the packet embedded in an ICMP error rather than the packet itself.
 */
#define CALI_CT_RELATED (1 << 8)
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I decided to pass the flag as part of the return code because (a) the result struct is part of the state and I did not want to change it, and (b) it effectively creates a whole set of return values — which was the other option — but they would differ only in being related or not.


/* A lookup result code keeps the result type in its low 8 bits and flag bits
 * above them.
 */

/* The result type alone, with all flag bits masked off. */
#define ct_result_rc(code) ((code) & 0xff)
/* The flag bits alone, with the low 8 result-type bits masked off. */
#define ct_result_flags(code) ((code) & ~0xff)
/* ORs the given bits into code in place; code must be an lvalue. */
#define ct_result_set_flag(code, bits) ((code) |= (bits))

/* Non-zero (the flag bit itself, not 1) when code carries CALI_CT_RELATED. */
#define ct_result_is_related(code) ((code) & CALI_CT_RELATED)

struct calico_ct_result {
__s16 rc;
__u16 flags;
Expand All @@ -335,36 +348,68 @@ struct calico_ct_result {
__be32 tun_ret_ip;
};

/* skb_is_icmp_err_unpack checks whether skb carries an ICMP error and, if so,
 * rewrites ctx to describe the packet embedded in that error: proto, src and
 * dst always, sport/dport and ctx->tcp when the embedded packet is TCP, and
 * sport/dport when it is UDP. Only what needs to change is written; for
 * instance ctx->skb and ctx->nat_tun_src are kept as they are.
 *
 * Returns true if the packet is an ICMP error and all checks went well. When
 * false is returned, ctx has not been modified.
 */
static CALI_BPF_INLINE bool skb_is_icmp_err_unpack(struct __sk_buff *skb, struct ct_ctx *ctx)
{
	struct iphdr *ip;
	struct icmphdr *icmp;

	if (!icmp_skb_get_hdr(skb, &icmp)) {
		CALI_DEBUG("CT-ICMP: failed to get inner IP\n");
		return false;
	}

	if (!icmp_type_is_err(icmp->type)) {
		CALI_DEBUG("CT-ICMP: type %d not an error\n", icmp->type);
		return false;
	}

	ip = (struct iphdr *)(icmp + 1); /* skip to inner ip */
	CALI_DEBUG("CT-ICMP: proto %d\n", ip->protocol);

	/* The L4 header is read right behind a fixed-size inner IP header, and
	 * icmp_skb_get_hdr() only length-checked sizeof(struct iphdr) + 8 bytes
	 * past the ICMP header. With IP options in the embedded header the
	 * ports would be read from option bytes, so such packets are rejected
	 * before ctx is touched.
	 */
	if (ip->ihl != 5) {
		CALI_DEBUG("CT-ICMP: inner ip options unsupported\n");
		return false;
	}

	ctx->proto = ip->protocol;
	ctx->src = ip->saddr;
	ctx->dst = ip->daddr;

	switch (ip->protocol) {
	case IPPROTO_TCP:
		{
			struct tcphdr *tcp = (struct tcphdr *)(ip + 1);
			ctx->sport = be16_to_host(tcp->source);
			ctx->dport = be16_to_host(tcp->dest);
			/* NOTE(review): only the first 8 bytes of the embedded
			 * TCP header were length-checked, so users of ctx->tcp
			 * must not read past the ports without a further check.
			 */
			ctx->tcp = tcp;
		}
		break;
	case IPPROTO_UDP:
		{
			struct udphdr *udp = (struct udphdr *)(ip + 1);
			ctx->sport = be16_to_host(udp->source);
			ctx->dport = be16_to_host(udp->dest);
		}
		break;
	default:
		/* Other protocols: ctx->sport and ctx->dport stay as they were. */
		break;
	}

	return true;
}

/* calico_ct_v4_tcp_delete removes the conntrack entry of a TCP flow from the
 * cali_v4_ct map.
 *
 * The key is built with the shared src_lt_dest()/ct_make_key() helpers, so the
 * endpoints are ordered the same way as in the lookup and create paths.
 * ip_src and ip_dst are big-endian addresses; sport and dport are stored into
 * the key unconverted. The outcome of the map delete is only logged, not
 * returned to the caller.
 */
static CALI_BPF_INLINE void calico_ct_v4_tcp_delete(
		__be32 ip_src, __be32 ip_dst, __u16 sport, __u16 dport)
{
	CALI_DEBUG("CT-TCP delete from %x:%d\n", be32_to_host(ip_src), sport);
	CALI_DEBUG("CT-TCP delete to   %x:%d\n", be32_to_host(ip_dst), dport);

	bool srcLTDest = src_lt_dest(ip_src, ip_dst, sport, dport);
	struct calico_ct_key k = ct_make_key(srcLTDest, IPPROTO_TCP, ip_src, ip_dst, sport, dport);

	int rc = bpf_map_delete_elem(&cali_v4_ct, &k);
	CALI_DEBUG("CT-TCP delete result: %d\n", rc);
}

#define CALI_CT_LOG(level, fmt, ...) \
CALI_LOG_IF_FLAG(level, CALI_COMPILE_FLAGS, "CT-%d "fmt, proto, ## __VA_ARGS__)
CALI_LOG_IF_FLAG(level, CALI_COMPILE_FLAGS, "CT-%d "fmt, proto_orig, ## __VA_ARGS__)
#define CALI_CT_DEBUG(fmt, ...) \
CALI_CT_LOG(CALI_LOG_LEVEL_DEBUG, fmt, ## __VA_ARGS__)
#define CALI_CT_VERB(fmt, ...) \
Expand All @@ -376,7 +421,7 @@ static CALI_BPF_INLINE void ct_tcp_entry_update(struct tcphdr *tcp_header,
struct calico_ct_leg *src_to_dst,
struct calico_ct_leg *dst_to_src)
{
__u8 proto = IPPROTO_TCP; /* used by logging */
__u8 proto_orig = IPPROTO_TCP; /* used by logging */

if (tcp_header->rst) {
CALI_CT_DEBUG("RST seen, marking CT entry.\n");
Expand Down Expand Up @@ -425,15 +470,15 @@ static CALI_BPF_INLINE void ct_tcp_entry_update(struct tcphdr *tcp_header,
}
}


static CALI_BPF_INLINE struct calico_ct_result calico_ct_v4_lookup(struct ct_ctx *ctx)
{
__u8 proto = ctx->proto;
__u8 proto_orig = ctx->proto;
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ctx->proto changes once we decode the icmp payload

__be32 ip_src = ctx->src;
__be32 ip_dst = ctx->dst;
__u16 sport = ctx->sport;
__u16 dport = ctx->dport;
struct tcphdr *tcp_header = ctx->tcp;
bool related = false;

CALI_CT_DEBUG("lookup from %x:%d\n", be32_to_host(ip_src), sport);
CALI_CT_DEBUG("lookup to %x:%d\n", be32_to_host(ip_dst), dport);
Expand All @@ -454,27 +499,38 @@ static CALI_BPF_INLINE struct calico_ct_result calico_ct_v4_lookup(struct ct_ctx
goto out_lookup_fail;
}

bool srcLTDest = (ip_src < ip_dst) || ((ip_src == ip_dst) && sport < dport);
struct calico_ct_key k;
if (srcLTDest) {
k = (struct calico_ct_key) {
.protocol = proto,
.addr_a = ip_src, .port_a = sport,
.addr_b = ip_dst, .port_b = dport,
};
} else {
k = (struct calico_ct_key) {
.protocol = proto,
.addr_a = ip_dst, .port_a = dport,
.addr_b = ip_src, .port_b = sport,
};
}
dump_ct_key(&k);
bool srcLTDest = src_lt_dest(ip_src, ip_dst, sport, dport);
struct calico_ct_key k = ct_make_key(srcLTDest, ctx->proto, ip_src, ip_dst, sport, dport);

struct calico_ct_value *v = bpf_map_lookup_elem(&cali_v4_ct, &k);
if (!v) {
CALI_CT_DEBUG("Miss.\n");
goto out_lookup_fail;
if (ctx->proto != IPPROTO_ICMP) {
CALI_CT_DEBUG("Miss.\n");
goto out_lookup_fail;
}
if (!skb_is_icmp_err_unpack(ctx->skb, ctx)) {
CALI_CT_DEBUG("unrelated icmp\n");
goto out_lookup_fail;
}

CALI_CT_DEBUG("related lookup from %x:%d\n", be32_to_host(ctx->src), ctx->sport);
CALI_CT_DEBUG("related lookup to %x:%d\n", be32_to_host(ctx->dst), ctx->dport);

srcLTDest = src_lt_dest(ctx->src, ctx->dst, ctx->sport, ctx->dport);
k = ct_make_key(srcLTDest, ctx->proto, ctx->src, ctx->dst, ctx->sport, ctx->dport);
v = bpf_map_lookup_elem(&cali_v4_ct, &k);
if (!v) {
CALI_CT_DEBUG("Miss on ICMP related\n");
goto out_lookup_fail;
}

ip_src = ctx->src;
ip_dst = ctx->dst;
sport = ctx->sport;
dport = ctx->dport;
tcp_header = ctx->tcp;

related = true;
}

__u64 now = bpf_ktime_get_ns();
Expand Down Expand Up @@ -514,7 +570,7 @@ static CALI_BPF_INLINE struct calico_ct_result calico_ct_v4_lookup(struct ct_ctx
result.tun_ret_ip = tracking_v->tun_ip;
CALI_CT_DEBUG("fwd tun_ip:%x\n", be32_to_host(tracking_v->tun_ip));

if (proto == IPPROTO_ICMP) {
if (ctx->proto == IPPROTO_ICMP) {
result.rc = CALI_CT_ESTABLISHED_DNAT;
result.nat_ip = tracking_v->orig_ip;
} else if (CALI_F_TO_HOST) {
Expand All @@ -536,9 +592,13 @@ static CALI_BPF_INLINE struct calico_ct_result calico_ct_v4_lookup(struct ct_ctx
dst_to_src = &v->a_to_b;
}

if (proto == IPPROTO_ICMP) {
result.tun_ret_ip = v->tun_ip;
CALI_CT_DEBUG("tun_ip:%x\n", be32_to_host(v->tun_ip));

if (ctx->proto == IPPROTO_ICMP || (related && proto_orig == IPPROTO_ICMP)) {
result.rc = CALI_CT_ESTABLISHED_SNAT;
result.nat_ip = v->orig_ip;
result.nat_port = v->orig_port;
break;
}

Expand Down Expand Up @@ -568,14 +628,10 @@ static CALI_BPF_INLINE struct calico_ct_result calico_ct_v4_lookup(struct ct_ctx
CALI_CT_DEBUG("Hit! NAT REV entry but not connection opener: ESTABLISHED.\n");
result.rc = CALI_CT_ESTABLISHED;
}
result.tun_ret_ip = v->tun_ip;
CALI_CT_DEBUG("tun_ip:%x\n", be32_to_host(v->tun_ip));
break;

case CALI_CT_TYPE_NORMAL:
if (v->type == CALI_CT_TYPE_NORMAL) {
CALI_CT_DEBUG("Hit! NORMAL entry.\n");
}
CALI_CT_DEBUG("Hit! NORMAL entry.\n");
CALI_CT_VERB("Created: %llu.\n", v->created);
if (tcp_header) {
CALI_CT_VERB("Last seen: %llu.\n", v->last_seen);
Expand Down Expand Up @@ -621,6 +677,19 @@ static CALI_BPF_INLINE struct calico_ct_result calico_ct_v4_lookup(struct ct_ctx
src_to_dst->whitelisted &&
!result.tun_ret_ip;

if (related) {
if (proto_orig == IPPROTO_ICMP) {
/* flip src/dst as ICMP related carries the original ip/l4 headers in
* opposite direction - it is a reaction on the original packet.
*/
struct calico_ct_leg *tmp;

tmp = src_to_dst;
src_to_dst = dst_to_src;
dst_to_src = tmp;
}
}

if (CALI_F_TO_HOST && !ctx->nat_tun_src) {
/* Source of the packet is the endpoint, so check the src whitelist. */
if (src_to_dst->whitelisted) {
Expand Down Expand Up @@ -652,7 +721,7 @@ static CALI_BPF_INLINE struct calico_ct_result calico_ct_v4_lookup(struct ct_ctx
}
}

if (tcp_header) {
if (tcp_header && !related) {
if (ret_from_tun) {
/* we returned from tunnel, we are after SNAT, unlike
* with NAT on workload, we hit FWD entry in both
Expand All @@ -666,7 +735,13 @@ static CALI_BPF_INLINE struct calico_ct_result calico_ct_v4_lookup(struct ct_ctx
ct_tcp_entry_update(tcp_header, src_to_dst, dst_to_src);
}

CALI_CT_DEBUG("result: %d.\n", result.rc);
CALI_CT_DEBUG("result: %d\n", result.rc);

if (related) {
ct_result_set_flag(result.rc, CALI_CT_RELATED);
CALI_CT_DEBUG("result: related\n");
}

return result;

out_lookup_fail:
Expand Down
40 changes: 40 additions & 0 deletions bpf-gpl/icmp.h
Expand Up @@ -193,4 +193,44 @@ static CALI_BPF_INLINE int icmp_v4_ttl_exceeded(struct __sk_buff *skb)
return icmp_v4_reply(skb, ICMP_TIME_EXCEEDED, ICMP_EXC_TTL, 0);
}

/* icmp_type_is_err tells whether the given ICMP type is one of those this
 * code treats as an error message: destination unreachable, source quench,
 * redirect, time exceeded or parameter problem.
 */
static CALI_BPF_INLINE bool icmp_type_is_err(__u8 type)
{
	return type == ICMP_DEST_UNREACH ||
		type == ICMP_SOURCE_QUENCH ||
		type == ICMP_REDIRECT ||
		type == ICMP_TIME_EXCEEDED ||
		type == ICMP_PARAMETERPROB;
}

/* icmp_skb_get_hdr locates the ICMP header of the IPv4 packet in skb.
 *
 * Counted from the start of the accessible data, the packet has to hold the
 * IP header offset, an option-less IP header, the ICMP header and a further
 * option-less IP header followed by 8 bytes. The outer IP header must not
 * carry options (ihl == 5).
 *
 * On success *icmp points right behind the outer IP header and true is
 * returned. On failure false is returned and *icmp is left untouched.
 */
static CALI_BPF_INLINE bool icmp_skb_get_hdr(struct __sk_buff *skb, struct icmphdr **icmp)
{
	long l3_off = skb_iphdr_offset(skb);
	int need = l3_off + sizeof(struct iphdr) + sizeof(struct icmphdr) +
		sizeof(struct iphdr) + 8;
	struct iphdr *outer;

	/* The length check comes first, before any header field is read. */
	if (skb_shorter(skb, need)) {
		CALI_DEBUG("ICMP: %d shorter than %d\n", skb_len_dir_access(skb), need);
		return false;
	}

	outer = skb_iphdr(skb);
	if (outer->ihl != 5) {
		CALI_INFO("ICMP: ip options unsupported\n");
		return false;
	}

	*icmp = (struct icmphdr *)(outer + 1);
	return true;
}

#endif /* __CALI_ICMP_H__ */
4 changes: 4 additions & 0 deletions bpf-gpl/skb.h
Expand Up @@ -35,6 +35,10 @@
#define skb_ptr(skb, off) ((void *)((long)(skb)->data + (off)))
#define skb_ptr_after(skb, ptr) ((void *)((ptr) + 1))

#define skb_len_dir_access(skb) skb_tail_len(skb, skb_start_ptr(skb))

#define skb_seen(skb) ((skb)->mark & CALI_SKB_MARK_SEEN)

#define IPV4_UDP_SIZE (sizeof(struct iphdr) + sizeof(struct udphdr))
#define ETH_IPV4_UDP_SIZE (sizeof(struct ethhdr) + IPV4_UDP_SIZE)

Expand Down