Skip to content

Commit

Permalink
Merge pull request #2247 from tomastigera/tomas-icmp-related
Browse files Browse the repository at this point in the history
ICMP errors generated by tracked flows treated as related traffic
  • Loading branch information
fasaxc committed Apr 21, 2020
2 parents 72dbcac + 0364cf6 commit e3ece8a
Show file tree
Hide file tree
Showing 11 changed files with 944 additions and 179 deletions.
201 changes: 138 additions & 63 deletions bpf-gpl/conntrack.h
Expand Up @@ -21,6 +21,7 @@
#include <linux/in.h>
#include "nat.h"
#include "bpf.h"
#include "icmp.h"

// Connection tracking.

Expand All @@ -30,6 +31,23 @@ struct calico_ct_key {
uint16_t port_a, port_b; // HBO
};

#define src_lt_dest(ip_src, ip_dst, sport, dport) \
((ip_src) < (ip_dst)) || (((ip_src) == (ip_dst)) && (sport) < (dport))

#define __ct_make_key(proto, ipa, ipb, porta, portb) \
(struct calico_ct_key) { \
.protocol = proto, \
.addr_a = ipa, .port_a = porta, \
.addr_b = ipb, .port_b = portb, \
}

#define ct_make_key(sltd, p, ipa, ipb, pta, ptb) ({ \
struct calico_ct_key k; \
k = sltd ? __ct_make_key(p, ipa, ipb, pta, ptb) : __ct_make_key(p, ipb, ipa, ptb, pta); \
dump_ct_key(&k); \
k; \
})

enum cali_ct_type {
CALI_CT_TYPE_NORMAL = 0x00, /* Non-NATted entry. */
CALI_CT_TYPE_NAT_FWD = 0x01, /* Forward entry for a DNATted flow, keyed on orig src/dst.
Expand Down Expand Up @@ -148,21 +166,8 @@ static CALI_BPF_INLINE int calico_ct_v4_create_tracking(struct ct_ctx *ctx,
*/
CALI_DEBUG("CT-ALL Asked to create entry but packet is marked as "
"from another endpoint, doing lookup\n");
bool srcLTDest = (ip_src < ip_dst) || ((ip_src == ip_dst) && sport < dport);
if (srcLTDest) {
*k = (struct calico_ct_key) {
.protocol = ctx->proto,
.addr_a = ip_src, .port_a = sport,
.addr_b = ip_dst, .port_b = dport,
};
} else {
*k = (struct calico_ct_key) {
.protocol = ctx->proto,
.addr_a = ip_dst, .port_a = dport,
.addr_b = ip_src, .port_b = sport,
};
}
dump_ct_key(k);
bool srcLTDest = src_lt_dest(ip_src, ip_dst, sport, dport);
*k = ct_make_key(srcLTDest, ctx->proto, ip_src, ip_dst, sport, dport);
struct calico_ct_value *ct_value = bpf_map_lookup_elem(&cali_v4_ct, k);
if (!ct_value) {
CALI_VERB("CT Packet marked as from workload but got a conntrack miss!\n");
Expand Down Expand Up @@ -327,6 +332,14 @@ enum calico_ct_result_type {
CALI_CT_INVALID,
};

#define CALI_CT_RELATED (1 << 8)

#define ct_result_rc(rc) ((rc) & 0xff)
#define ct_result_flags(rc) ((rc) & ~0xff)
#define ct_result_set_flag(val, flags) ((val) |= (flags))

#define ct_result_is_related(rc) ((rc) & CALI_CT_RELATED)

struct calico_ct_result {
__s16 rc;
__u16 flags;
Expand All @@ -335,36 +348,68 @@ struct calico_ct_result {
__be32 tun_ret_ip;
};

/* skb_is_icmp_err_unpack fills in ctx, but only what needs to be changed. For instance, keeps the
* cxt->skb or ctx->nat_tun_src. It returns true if the original packet is an icmp error and all
* checks went well.
*/
static CALI_BPF_INLINE bool skb_is_icmp_err_unpack(struct __sk_buff *skb, struct ct_ctx *ctx)
{
struct iphdr *ip;
struct icmphdr *icmp;

if (!icmp_skb_get_hdr(skb, &icmp)) {
CALI_DEBUG("CT-ICMP: failed to get inner IP\n");
return false;
}

if (!icmp_type_is_err(icmp->type)) {
CALI_DEBUG("CT-ICMP: type %d not an error\n", icmp->type);
return false;
}

ip = (struct iphdr *)(icmp + 1); /* skip to inner ip */
CALI_DEBUG("CT-ICMP: proto %d\n", ip->protocol);

ctx->proto = ip->protocol;
ctx->src = ip->saddr;
ctx->dst = ip->daddr;

switch (ip->protocol) {
case IPPROTO_TCP:
{
struct tcphdr *tcp = (struct tcphdr *)(ip + 1);
ctx->sport = be16_to_host(tcp->source);
ctx->dport = be16_to_host(tcp->dest);
ctx->tcp = tcp;
}
break;
case IPPROTO_UDP:
{
struct udphdr *udp = (struct udphdr *)(ip + 1);
ctx->sport = be16_to_host(udp->source);
ctx->dport = be16_to_host(udp->dest);
}
break;
};

return true;
}

static CALI_BPF_INLINE void calico_ct_v4_tcp_delete(
__be32 ip_src, __be32 ip_dst, __u16 sport, __u16 dport)
{
CALI_DEBUG("CT-TCP delete from %x:%d\n", be32_to_host(ip_src), sport);
CALI_DEBUG("CT-TCP delete to %x:%d\n", be32_to_host(ip_dst), dport);

bool srcLTDest = (ip_src < ip_dst) || ((ip_src == ip_dst) && sport < dport);
struct calico_ct_key k;
if (srcLTDest) {
k = (struct calico_ct_key) {
.protocol = IPPROTO_TCP,
.addr_a = ip_src, .port_a = sport,
.addr_b = ip_dst, .port_b = dport,
};
} else {
k = (struct calico_ct_key) {
.protocol = IPPROTO_TCP,
.addr_a = ip_dst, .port_a = dport,
.addr_b = ip_src, .port_b = sport,
};
}

dump_ct_key(&k);
bool srcLTDest = src_lt_dest(ip_src, ip_dst, sport, dport);
struct calico_ct_key k = ct_make_key(srcLTDest, IPPROTO_TCP, ip_src, ip_dst, sport, dport);

int rc = bpf_map_delete_elem(&cali_v4_ct, &k);
CALI_DEBUG("CT-TCP delete result: %d\n", rc);
}

#define CALI_CT_LOG(level, fmt, ...) \
CALI_LOG_IF_FLAG(level, CALI_COMPILE_FLAGS, "CT-%d "fmt, proto, ## __VA_ARGS__)
CALI_LOG_IF_FLAG(level, CALI_COMPILE_FLAGS, "CT-%d "fmt, proto_orig, ## __VA_ARGS__)
#define CALI_CT_DEBUG(fmt, ...) \
CALI_CT_LOG(CALI_LOG_LEVEL_DEBUG, fmt, ## __VA_ARGS__)
#define CALI_CT_VERB(fmt, ...) \
Expand All @@ -376,7 +421,7 @@ static CALI_BPF_INLINE void ct_tcp_entry_update(struct tcphdr *tcp_header,
struct calico_ct_leg *src_to_dst,
struct calico_ct_leg *dst_to_src)
{
__u8 proto = IPPROTO_TCP; /* used by logging */
__u8 proto_orig = IPPROTO_TCP; /* used by logging */

if (tcp_header->rst) {
CALI_CT_DEBUG("RST seen, marking CT entry.\n");
Expand Down Expand Up @@ -425,15 +470,15 @@ static CALI_BPF_INLINE void ct_tcp_entry_update(struct tcphdr *tcp_header,
}
}


static CALI_BPF_INLINE struct calico_ct_result calico_ct_v4_lookup(struct ct_ctx *ctx)
{
__u8 proto = ctx->proto;
__u8 proto_orig = ctx->proto;
__be32 ip_src = ctx->src;
__be32 ip_dst = ctx->dst;
__u16 sport = ctx->sport;
__u16 dport = ctx->dport;
struct tcphdr *tcp_header = ctx->tcp;
bool related = false;

CALI_CT_DEBUG("lookup from %x:%d\n", be32_to_host(ip_src), sport);
CALI_CT_DEBUG("lookup to %x:%d\n", be32_to_host(ip_dst), dport);
Expand All @@ -454,27 +499,38 @@ static CALI_BPF_INLINE struct calico_ct_result calico_ct_v4_lookup(struct ct_ctx
goto out_lookup_fail;
}

bool srcLTDest = (ip_src < ip_dst) || ((ip_src == ip_dst) && sport < dport);
struct calico_ct_key k;
if (srcLTDest) {
k = (struct calico_ct_key) {
.protocol = proto,
.addr_a = ip_src, .port_a = sport,
.addr_b = ip_dst, .port_b = dport,
};
} else {
k = (struct calico_ct_key) {
.protocol = proto,
.addr_a = ip_dst, .port_a = dport,
.addr_b = ip_src, .port_b = sport,
};
}
dump_ct_key(&k);
bool srcLTDest = src_lt_dest(ip_src, ip_dst, sport, dport);
struct calico_ct_key k = ct_make_key(srcLTDest, ctx->proto, ip_src, ip_dst, sport, dport);

struct calico_ct_value *v = bpf_map_lookup_elem(&cali_v4_ct, &k);
if (!v) {
CALI_CT_DEBUG("Miss.\n");
goto out_lookup_fail;
if (ctx->proto != IPPROTO_ICMP) {
CALI_CT_DEBUG("Miss.\n");
goto out_lookup_fail;
}
if (!skb_is_icmp_err_unpack(ctx->skb, ctx)) {
CALI_CT_DEBUG("unrelated icmp\n");
goto out_lookup_fail;
}

CALI_CT_DEBUG("related lookup from %x:%d\n", be32_to_host(ctx->src), ctx->sport);
CALI_CT_DEBUG("related lookup to %x:%d\n", be32_to_host(ctx->dst), ctx->dport);

srcLTDest = src_lt_dest(ctx->src, ctx->dst, ctx->sport, ctx->dport);
k = ct_make_key(srcLTDest, ctx->proto, ctx->src, ctx->dst, ctx->sport, ctx->dport);
v = bpf_map_lookup_elem(&cali_v4_ct, &k);
if (!v) {
CALI_CT_DEBUG("Miss on ICMP related\n");
goto out_lookup_fail;
}

ip_src = ctx->src;
ip_dst = ctx->dst;
sport = ctx->sport;
dport = ctx->dport;
tcp_header = ctx->tcp;

related = true;
}

__u64 now = bpf_ktime_get_ns();
Expand Down Expand Up @@ -514,7 +570,7 @@ static CALI_BPF_INLINE struct calico_ct_result calico_ct_v4_lookup(struct ct_ctx
result.tun_ret_ip = tracking_v->tun_ip;
CALI_CT_DEBUG("fwd tun_ip:%x\n", be32_to_host(tracking_v->tun_ip));

if (proto == IPPROTO_ICMP) {
if (ctx->proto == IPPROTO_ICMP) {
result.rc = CALI_CT_ESTABLISHED_DNAT;
result.nat_ip = tracking_v->orig_ip;
} else if (CALI_F_TO_HOST) {
Expand All @@ -536,9 +592,13 @@ static CALI_BPF_INLINE struct calico_ct_result calico_ct_v4_lookup(struct ct_ctx
dst_to_src = &v->a_to_b;
}

if (proto == IPPROTO_ICMP) {
result.tun_ret_ip = v->tun_ip;
CALI_CT_DEBUG("tun_ip:%x\n", be32_to_host(v->tun_ip));

if (ctx->proto == IPPROTO_ICMP || (related && proto_orig == IPPROTO_ICMP)) {
result.rc = CALI_CT_ESTABLISHED_SNAT;
result.nat_ip = v->orig_ip;
result.nat_port = v->orig_port;
break;
}

Expand Down Expand Up @@ -568,14 +628,10 @@ static CALI_BPF_INLINE struct calico_ct_result calico_ct_v4_lookup(struct ct_ctx
CALI_CT_DEBUG("Hit! NAT REV entry but not connection opener: ESTABLISHED.\n");
result.rc = CALI_CT_ESTABLISHED;
}
result.tun_ret_ip = v->tun_ip;
CALI_CT_DEBUG("tun_ip:%x\n", be32_to_host(v->tun_ip));
break;

case CALI_CT_TYPE_NORMAL:
if (v->type == CALI_CT_TYPE_NORMAL) {
CALI_CT_DEBUG("Hit! NORMAL entry.\n");
}
CALI_CT_DEBUG("Hit! NORMAL entry.\n");
CALI_CT_VERB("Created: %llu.\n", v->created);
if (tcp_header) {
CALI_CT_VERB("Last seen: %llu.\n", v->last_seen);
Expand Down Expand Up @@ -621,6 +677,19 @@ static CALI_BPF_INLINE struct calico_ct_result calico_ct_v4_lookup(struct ct_ctx
src_to_dst->whitelisted &&
!result.tun_ret_ip;

if (related) {
if (proto_orig == IPPROTO_ICMP) {
/* flip src/dst as ICMP related carries the original ip/l4 headers in
* opposite direction - it is a reaction on the original packet.
*/
struct calico_ct_leg *tmp;

tmp = src_to_dst;
src_to_dst = dst_to_src;
dst_to_src = tmp;
}
}

if (CALI_F_TO_HOST && !ctx->nat_tun_src) {
/* Source of the packet is the endpoint, so check the src whitelist. */
if (src_to_dst->whitelisted) {
Expand Down Expand Up @@ -652,7 +721,7 @@ static CALI_BPF_INLINE struct calico_ct_result calico_ct_v4_lookup(struct ct_ctx
}
}

if (tcp_header) {
if (tcp_header && !related) {
if (ret_from_tun) {
/* we returned from tunnel, we are after SNAT, unlike
* with NAT on workload, we hit FWD entry in both
Expand All @@ -666,7 +735,13 @@ static CALI_BPF_INLINE struct calico_ct_result calico_ct_v4_lookup(struct ct_ctx
ct_tcp_entry_update(tcp_header, src_to_dst, dst_to_src);
}

CALI_CT_DEBUG("result: %d.\n", result.rc);
CALI_CT_DEBUG("result: %d\n", result.rc);

if (related) {
ct_result_set_flag(result.rc, CALI_CT_RELATED);
CALI_CT_DEBUG("result: related\n");
}

return result;

out_lookup_fail:
Expand Down
40 changes: 40 additions & 0 deletions bpf-gpl/icmp.h
Expand Up @@ -193,4 +193,44 @@ static CALI_BPF_INLINE int icmp_v4_ttl_exceeded(struct __sk_buff *skb)
return icmp_v4_reply(skb, ICMP_TIME_EXCEEDED, ICMP_EXC_TTL, 0);
}

static CALI_BPF_INLINE bool icmp_type_is_err(__u8 type)
{
switch (type) {
case ICMP_DEST_UNREACH:
case ICMP_SOURCE_QUENCH:
case ICMP_REDIRECT:
case ICMP_TIME_EXCEEDED:
case ICMP_PARAMETERPROB:
return true;
}

return false;
}

static CALI_BPF_INLINE bool icmp_skb_get_hdr(struct __sk_buff *skb, struct icmphdr **icmp)
{
struct iphdr *ip;
long ip_off;
int minsz;

ip_off = skb_iphdr_offset(skb);
minsz = ip_off + sizeof(struct iphdr) + sizeof(struct icmphdr) + sizeof(struct iphdr) + 8;

if (skb_shorter(skb, minsz)) {
CALI_DEBUG("ICMP: %d shorter than %d\n", skb_len_dir_access(skb), minsz);
return false;
}

ip = skb_iphdr(skb);

if (ip->ihl != 5) {
CALI_INFO("ICMP: ip options unsupported\n");
return false;
}

*icmp = (struct icmphdr *)(ip + 1);

return true;
}

#endif /* __CALI_ICMP_H__ */
4 changes: 4 additions & 0 deletions bpf-gpl/skb.h
Expand Up @@ -35,6 +35,10 @@
#define skb_ptr(skb, off) ((void *)((long)(skb)->data + (off)))
#define skb_ptr_after(skb, ptr) ((void *)((ptr) + 1))

#define skb_len_dir_access(skb) skb_tail_len(skb, skb_start_ptr(skb))

#define skb_seen(skb) ((skb)->mark & CALI_SKB_MARK_SEEN)

#define IPV4_UDP_SIZE (sizeof(struct iphdr) + sizeof(struct udphdr))
#define ETH_IPV4_UDP_SIZE (sizeof(struct ethhdr) + IPV4_UDP_SIZE)

Expand Down

0 comments on commit e3ece8a

Please sign in to comment.