Skip to content

Commit

Permalink
hw/net/net_tx_pkt: Implement TCP segmentation
Browse files Browse the repository at this point in the history
There was no proper implementation of TCP segmentation before this
change, and net_tx_pkt relied solely on IPv4 fragmentation. Not only
is this not aligned with the specification, but it also resulted in
corrupted IPv6 packets.

This is particularly problematic for igb, a newly proposed device
implementation; igb provides a loopback feature for VMDq, and that
feature relies on software segmentation.

Implement proper TCP segmentation in net_tx_pkt to fix such a scenario.

Signed-off-by: Akihiko Odaki <akihiko.odaki@daynix.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
  • Loading branch information
akihikodaki authored and jasowang committed Mar 10, 2023
1 parent ffbd2db commit 02ef5fd
Show file tree
Hide file tree
Showing 3 changed files with 206 additions and 74 deletions.
248 changes: 206 additions & 42 deletions hw/net/net_tx_pkt.c
Expand Up @@ -326,7 +326,8 @@ bool net_tx_pkt_build_vheader(struct NetTxPkt *pkt, bool tso_enable,
case VIRTIO_NET_HDR_GSO_TCPV6:
bytes_read = iov_to_buf(&pkt->vec[NET_TX_PKT_PL_START_FRAG],
pkt->payload_frags, 0, &l4hdr, sizeof(l4hdr));
if (bytes_read < sizeof(l4hdr)) {
if (bytes_read < sizeof(l4hdr) ||
l4hdr.th_off * sizeof(uint32_t) < sizeof(l4hdr)) {
return false;
}

Expand Down Expand Up @@ -466,24 +467,21 @@ void net_tx_pkt_reset(struct NetTxPkt *pkt)
pkt->l4proto = 0;
}

static void net_tx_pkt_do_sw_csum(struct NetTxPkt *pkt)
static void net_tx_pkt_do_sw_csum(struct NetTxPkt *pkt,
struct iovec *iov, uint32_t iov_len,
uint16_t csl)
{
struct iovec *iov = &pkt->vec[NET_TX_PKT_L2HDR_FRAG];
uint32_t csum_cntr;
uint16_t csum = 0;
uint32_t cso;
/* num of iovec without vhdr */
uint32_t iov_len = pkt->payload_frags + NET_TX_PKT_PL_START_FRAG - 1;
uint16_t csl;
size_t csum_offset = pkt->virt_hdr.csum_start + pkt->virt_hdr.csum_offset;
uint16_t l3_proto = eth_get_l3_proto(iov, 1, iov->iov_len);

/* Put zero to checksum field */
iov_from_buf(iov, iov_len, csum_offset, &csum, sizeof csum);

/* Calculate L4 TCP/UDP checksum */
csl = pkt->payload_len;

csum_cntr = 0;
cso = 0;
/* add pseudo header to csum */
Expand All @@ -509,14 +507,13 @@ static void net_tx_pkt_do_sw_csum(struct NetTxPkt *pkt)
#define NET_MAX_FRAG_SG_LIST (64)

static size_t net_tx_pkt_fetch_fragment(struct NetTxPkt *pkt,
int *src_idx, size_t *src_offset, struct iovec *dst, int *dst_idx)
int *src_idx, size_t *src_offset, size_t src_len,
struct iovec *dst, int *dst_idx)
{
size_t fetched = 0;
struct iovec *src = pkt->vec;

*dst_idx = NET_TX_PKT_PL_START_FRAG;

while (fetched < IP_FRAG_ALIGN_SIZE(pkt->virt_hdr.gso_size)) {
while (fetched < src_len) {

/* no more place in fragment iov */
if (*dst_idx == NET_MAX_FRAG_SG_LIST) {
Expand All @@ -531,7 +528,7 @@ static size_t net_tx_pkt_fetch_fragment(struct NetTxPkt *pkt,

dst[*dst_idx].iov_base = src[*src_idx].iov_base + *src_offset;
dst[*dst_idx].iov_len = MIN(src[*src_idx].iov_len - *src_offset,
IP_FRAG_ALIGN_SIZE(pkt->virt_hdr.gso_size) - fetched);
src_len - fetched);

*src_offset += dst[*dst_idx].iov_len;
fetched += dst[*dst_idx].iov_len;
Expand Down Expand Up @@ -560,58 +557,223 @@ static void net_tx_pkt_sendv(
}
}

/*
 * Set up the TCP header slot of @fragment for software segmentation.
 *
 * A private buffer of (virt_hdr.hdr_len - pkt->hdr_len) bytes is allocated,
 * filled from the beginning of the packet's payload iovecs and stored in
 * fragment[NET_TX_PKT_PL_START_FRAG].  TH_FIN and TH_PUSH are cleared in
 * that copy.
 *
 * Outputs on success:
 *   @pl_idx     - slot of @fragment right after the header slot
 *   @l4hdr_len  - length of the copied header
 *   @src_idx    - index into pkt->vec of the iovec where copying stopped
 *   @src_offset - number of bytes already consumed from that iovec
 *   @src_len    - virt_hdr.gso_size
 *
 * Returns false if the packet has no payload iovecs, or if the payload
 * iovecs run out before the whole header has been copied; the buffer is
 * freed in the latter case.  On success the buffer stays allocated and is
 * released by net_tx_pkt_tcp_fragment_deinit().
 */
static bool net_tx_pkt_tcp_fragment_init(struct NetTxPkt *pkt,
                                         struct iovec *fragment,
                                         int *pl_idx,
                                         size_t *l4hdr_len,
                                         int *src_idx,
                                         size_t *src_offset,
                                         size_t *src_len)
{
    struct iovec *hdr = &fragment[NET_TX_PKT_PL_START_FRAG];
    struct tcp_hdr *tcp;
    size_t copied = 0;

    if (pkt->payload_frags == 0) {
        return false;
    }

    hdr->iov_len = pkt->virt_hdr.hdr_len - pkt->hdr_len;
    hdr->iov_base = g_malloc(hdr->iov_len);

    /* Swallow every source iovec that is shorter than what is still missing */
    *src_idx = NET_TX_PKT_PL_START_FRAG;
    while (pkt->vec[*src_idx].iov_len < hdr->iov_len - copied) {
        const struct iovec *src = &pkt->vec[*src_idx];

        memcpy((char *)hdr->iov_base + copied, src->iov_base, src->iov_len);
        copied += src->iov_len;

        *src_idx += 1;
        if (*src_idx >= pkt->payload_frags + NET_TX_PKT_PL_START_FRAG) {
            /* Ran out of payload before the header was complete */
            g_free(hdr->iov_base);
            return false;
        }
    }

    /* The rest of the header comes from the front of the current iovec */
    *src_offset = hdr->iov_len - copied;
    memcpy((char *)hdr->iov_base + copied, pkt->vec[*src_idx].iov_base,
           *src_offset);

    tcp = hdr->iov_base;
    tcp->th_flags &= ~(TH_FIN | TH_PUSH);

    *l4hdr_len = hdr->iov_len;
    *src_len = pkt->virt_hdr.gso_size;
    *pl_idx = NET_TX_PKT_PL_START_FRAG + 1;

    return true;
}

/*
 * Free the buffer held in the NET_TX_PKT_PL_START_FRAG slot of @fragment,
 * i.e. the header copy that net_tx_pkt_tcp_fragment_init() allocates.
 */
static void net_tx_pkt_tcp_fragment_deinit(struct iovec *fragment)
{
    struct iovec *l4hdr = &fragment[NET_TX_PKT_PL_START_FRAG];

    g_free(l4hdr->iov_base);
}

/*
 * Patch the L3 header of @fragment so its length field matches a segment
 * that carries @fragment_len bytes after the L4 header.
 *
 * VIRTIO_NET_HDR_GSO_TCPV4: ip_len is set to L3 header + L4 header +
 *     @fragment_len and the IPv4 header checksum is recomputed.
 * VIRTIO_NET_HDR_GSO_TCPV6: the payload length is set to the same sum minus
 *     sizeof(struct ip6_header).
 * Any other @gso_type leaves the headers untouched.
 *
 * @pkt is not used.
 */
static void net_tx_pkt_tcp_fragment_fix(struct NetTxPkt *pkt,
                                        struct iovec *fragment,
                                        size_t fragment_len,
                                        uint8_t gso_type)
{
    struct iovec *l3 = &fragment[NET_TX_PKT_L3HDR_FRAG];
    struct iovec *l4 = &fragment[NET_TX_PKT_PL_START_FRAG];
    size_t total = l3->iov_len + l4->iov_len + fragment_len;

    if (gso_type == VIRTIO_NET_HDR_GSO_TCPV4) {
        struct ip_header *ip4 = l3->iov_base;

        ip4->ip_len = cpu_to_be16(total);
        eth_fix_ip4_checksum(l3->iov_base, l3->iov_len);
    } else if (gso_type == VIRTIO_NET_HDR_GSO_TCPV6) {
        struct ip6_header *ip6 = l3->iov_base;

        /* Payload length excludes the fixed IPv6 header */
        total -= sizeof(struct ip6_header);
        ip6->ip6_ctlun.ip6_un1.ip6_un1_plen = cpu_to_be16(total);
    }
}

/*
 * Move the headers in @fragment on to the next segment after one carrying
 * @fragment_len bytes has been handed out.
 *
 * The TCP sequence number grows by @fragment_len and TH_CWR is cleared.
 * For VIRTIO_NET_HDR_GSO_TCPV4 the IPv4 identification field is also
 * incremented by one.
 *
 * @pkt is not used.
 */
static void net_tx_pkt_tcp_fragment_advance(struct NetTxPkt *pkt,
                                            struct iovec *fragment,
                                            size_t fragment_len,
                                            uint8_t gso_type)
{
    struct iovec *l3 = &fragment[NET_TX_PKT_L3HDR_FRAG];
    struct iovec *l4 = &fragment[NET_TX_PKT_PL_START_FRAG];
    struct tcp_hdr *tcp = l4->iov_base;

    if (gso_type == VIRTIO_NET_HDR_GSO_TCPV4) {
        struct ip_header *ip4 = l3->iov_base;

        ip4->ip_id = cpu_to_be16(be16_to_cpu(ip4->ip_id) + 1);
    }

    tcp->th_seq = cpu_to_be32(be32_to_cpu(tcp->th_seq) + fragment_len);
    tcp->th_flags &= ~TH_CWR;
}

/*
 * Initialise the fragmentation cursors for the VIRTIO_NET_HDR_GSO_UDP path.
 *
 * No separate L4 header slot is used: payload slots start at
 * NET_TX_PKT_PL_START_FRAG and @l4hdr_len is 0.  Reading starts at the
 * first payload iovec of the packet, offset 0, and each fragment takes
 * IP_FRAG_ALIGN_SIZE(virt_hdr.gso_size) bytes.
 */
static void net_tx_pkt_udp_fragment_init(struct NetTxPkt *pkt,
                                         int *pl_idx,
                                         size_t *l4hdr_len,
                                         int *src_idx, size_t *src_offset,
                                         size_t *src_len)
{
    /* Source cursor: beginning of the payload */
    *src_idx = NET_TX_PKT_PL_START_FRAG;
    *src_offset = 0;
    *src_len = IP_FRAG_ALIGN_SIZE(pkt->virt_hdr.gso_size);

    /* Destination layout: payload directly follows the L3 header */
    *pl_idx = NET_TX_PKT_PL_START_FRAG;
    *l4hdr_len = 0;
}

/*
 * Rewrite the IPv4 header in @fragment for one IP fragment.
 *
 * @fragment_offset is the byte offset of this fragment within the payload;
 * it must be a multiple of IP_FRAG_UNIT_SIZE and fit in the offset field
 * (both are asserted).  @fragment_len is the number of payload bytes in
 * this fragment.
 *
 * ip_off keeps its original bits outside IP_OFFMASK and IP_MF, receives the
 * new offset, and gets IP_MF when more payload follows this fragment.
 * ip_len becomes L3 header length + @fragment_len, and the header checksum
 * is recomputed.
 */
static void net_tx_pkt_udp_fragment_fix(struct NetTxPkt *pkt,
                                        struct iovec *fragment,
                                        size_t fragment_offset,
                                        size_t fragment_len)
{
    struct iovec *l3 = &fragment[NET_TX_PKT_L3HDR_FRAG];
    struct ip_header *ip4 = l3->iov_base;
    uint16_t offset_units = fragment_offset / IP_FRAG_UNIT_SIZE;
    uint16_t ip_off;

    assert(fragment_offset % IP_FRAG_UNIT_SIZE == 0);
    assert((offset_units & ~IP_OFFMASK) == 0);

    /* Start from the flags that are neither offset nor "more fragments" */
    ip_off = be16_to_cpu(ip4->ip_off) & ~(IP_OFFMASK | IP_MF);
    ip_off |= offset_units;
    if (fragment_offset + fragment_len < pkt->payload_len) {
        ip_off |= IP_MF;
    }

    ip4->ip_off = cpu_to_be16(ip_off);
    ip4->ip_len = cpu_to_be16(l3->iov_len + fragment_len);

    eth_fix_ip4_checksum(l3->iov_base, l3->iov_len);
}

static bool net_tx_pkt_do_sw_fragmentation(struct NetTxPkt *pkt,
NetTxPktCallback callback,
void *context)
{
uint8_t gso_type = pkt->virt_hdr.gso_type & ~VIRTIO_NET_HDR_GSO_ECN;

struct iovec fragment[NET_MAX_FRAG_SG_LIST];
size_t fragment_len = 0;
bool more_frags = false;

/* some pointers for shorter code */
void *l2_iov_base, *l3_iov_base;
size_t l2_iov_len, l3_iov_len;
int src_idx = NET_TX_PKT_PL_START_FRAG, dst_idx;
size_t src_offset = 0;
size_t fragment_len;
size_t l4hdr_len;
size_t src_len;

int src_idx, dst_idx, pl_idx;
size_t src_offset;
size_t fragment_offset = 0;
struct virtio_net_hdr virt_hdr = {
.flags = pkt->virt_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM ?
VIRTIO_NET_HDR_F_DATA_VALID : 0
};

l2_iov_base = pkt->vec[NET_TX_PKT_L2HDR_FRAG].iov_base;
l2_iov_len = pkt->vec[NET_TX_PKT_L2HDR_FRAG].iov_len;
l3_iov_base = pkt->vec[NET_TX_PKT_L3HDR_FRAG].iov_base;
l3_iov_len = pkt->vec[NET_TX_PKT_L3HDR_FRAG].iov_len;

/* Copy headers */
fragment[NET_TX_PKT_VHDR_FRAG].iov_base = &virt_hdr;
fragment[NET_TX_PKT_VHDR_FRAG].iov_len = sizeof(virt_hdr);
fragment[NET_TX_PKT_L2HDR_FRAG].iov_base = l2_iov_base;
fragment[NET_TX_PKT_L2HDR_FRAG].iov_len = l2_iov_len;
fragment[NET_TX_PKT_L3HDR_FRAG].iov_base = l3_iov_base;
fragment[NET_TX_PKT_L3HDR_FRAG].iov_len = l3_iov_len;
fragment[NET_TX_PKT_L2HDR_FRAG] = pkt->vec[NET_TX_PKT_L2HDR_FRAG];
fragment[NET_TX_PKT_L3HDR_FRAG] = pkt->vec[NET_TX_PKT_L3HDR_FRAG];

switch (gso_type) {
case VIRTIO_NET_HDR_GSO_TCPV4:
case VIRTIO_NET_HDR_GSO_TCPV6:
if (!net_tx_pkt_tcp_fragment_init(pkt, fragment, &pl_idx, &l4hdr_len,
&src_idx, &src_offset, &src_len)) {
return false;
}
break;

/* Put as much data as possible and send */
do {
fragment_len = net_tx_pkt_fetch_fragment(pkt, &src_idx, &src_offset,
fragment, &dst_idx);
case VIRTIO_NET_HDR_GSO_UDP:
net_tx_pkt_do_sw_csum(pkt, &pkt->vec[NET_TX_PKT_L2HDR_FRAG],
pkt->payload_frags + NET_TX_PKT_PL_START_FRAG - 1,
pkt->payload_len);
net_tx_pkt_udp_fragment_init(pkt, &pl_idx, &l4hdr_len,
&src_idx, &src_offset, &src_len);
break;

more_frags = (fragment_offset + fragment_len < pkt->payload_len);
default:
abort();
}

eth_setup_ip4_fragmentation(l2_iov_base, l2_iov_len, l3_iov_base,
l3_iov_len, fragment_len, fragment_offset, more_frags);
/* Put as much data as possible and send */
while (true) {
dst_idx = pl_idx;
fragment_len = net_tx_pkt_fetch_fragment(pkt,
&src_idx, &src_offset, src_len, fragment, &dst_idx);
if (!fragment_len) {
break;
}

eth_fix_ip4_checksum(l3_iov_base, l3_iov_len);
switch (gso_type) {
case VIRTIO_NET_HDR_GSO_TCPV4:
case VIRTIO_NET_HDR_GSO_TCPV6:
net_tx_pkt_tcp_fragment_fix(pkt, fragment, fragment_len, gso_type);
net_tx_pkt_do_sw_csum(pkt, fragment + NET_TX_PKT_L2HDR_FRAG,
dst_idx - NET_TX_PKT_L2HDR_FRAG,
l4hdr_len + fragment_len);
break;

case VIRTIO_NET_HDR_GSO_UDP:
net_tx_pkt_udp_fragment_fix(pkt, fragment, fragment_offset,
fragment_len);
break;
}

callback(context,
fragment + NET_TX_PKT_L2HDR_FRAG, dst_idx - NET_TX_PKT_L2HDR_FRAG,
fragment + NET_TX_PKT_VHDR_FRAG, dst_idx - NET_TX_PKT_VHDR_FRAG);

if (gso_type == VIRTIO_NET_HDR_GSO_TCPV4 ||
gso_type == VIRTIO_NET_HDR_GSO_TCPV6) {
net_tx_pkt_tcp_fragment_advance(pkt, fragment, fragment_len,
gso_type);
}

fragment_offset += fragment_len;
}

} while (fragment_len && more_frags);
if (gso_type == VIRTIO_NET_HDR_GSO_TCPV4 ||
gso_type == VIRTIO_NET_HDR_GSO_TCPV6) {
net_tx_pkt_tcp_fragment_deinit(fragment);
}

return true;
}
Expand All @@ -627,10 +789,6 @@ bool net_tx_pkt_send_custom(struct NetTxPkt *pkt, bool offload,
{
assert(pkt);

if (!offload && pkt->virt_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
net_tx_pkt_do_sw_csum(pkt);
}

/*
* Since underlying infrastructure does not support IP datagrams longer
* than 64K we should drop such packets and don't even try to send
Expand All @@ -644,6 +802,12 @@ bool net_tx_pkt_send_custom(struct NetTxPkt *pkt, bool offload,
}

if (offload || pkt->virt_hdr.gso_type == VIRTIO_NET_HDR_GSO_NONE) {
if (!offload && pkt->virt_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
net_tx_pkt_do_sw_csum(pkt, &pkt->vec[NET_TX_PKT_L2HDR_FRAG],
pkt->payload_frags + NET_TX_PKT_PL_START_FRAG - 1,
pkt->payload_len);
}

net_tx_pkt_fix_ip6_payload_len(pkt);
callback(context, pkt->vec + NET_TX_PKT_L2HDR_FRAG,
pkt->payload_frags + NET_TX_PKT_PL_START_FRAG - NET_TX_PKT_L2HDR_FRAG,
Expand Down
5 changes: 0 additions & 5 deletions include/net/eth.h
Expand Up @@ -400,11 +400,6 @@ void eth_get_protocols(const struct iovec *iov, int iovcnt,
eth_ip4_hdr_info *ip4hdr_info,
eth_l4_hdr_info *l4hdr_info);

void eth_setup_ip4_fragmentation(const void *l2hdr, size_t l2hdr_len,
void *l3hdr, size_t l3hdr_len,
size_t l3payload_len,
size_t frag_offset, bool more_frags);

void
eth_fix_ip4_checksum(void *l3hdr, size_t l3hdr_len);

Expand Down
27 changes: 0 additions & 27 deletions net/eth.c
Expand Up @@ -314,33 +314,6 @@ eth_strip_vlan_ex(const struct iovec *iov, int iovcnt, size_t iovoff,
return 0;
}

/*
 * Fill in the fragmentation fields of an IPv4 header.
 *
 * @l2hdr/@l2hdr_len describe the L2 header, which is only inspected to
 * find the L3 protocol; nothing is changed unless it is ETH_P_IP.
 * @l3hdr is the IPv4 header to modify and @l3hdr_len its length.
 * @l3payload_len is the number of payload bytes in this fragment.
 * @frag_offset is the byte offset of the fragment; it must be a multiple
 * of IP_FRAG_UNIT_SIZE and fit in the offset field (both are asserted).
 * @more_frags selects whether IP_MF is set.
 *
 * ip_off keeps its original bits outside IP_OFFMASK and IP_MF; ip_len is
 * set to @l3payload_len + @l3hdr_len.  The header checksum is not touched
 * here.
 */
void
eth_setup_ip4_fragmentation(const void *l2hdr, size_t l2hdr_len,
                            void *l3hdr, size_t l3hdr_len,
                            size_t l3payload_len,
                            size_t frag_offset, bool more_frags)
{
    const struct iovec l2vec = {
        .iov_base = (void *) l2hdr,
        .iov_len = l2hdr_len
    };
    struct ip_header *iphdr;
    uint16_t offset_units;
    uint16_t ip_off;

    if (eth_get_l3_proto(&l2vec, 1, l2hdr_len) != ETH_P_IP) {
        return;
    }

    iphdr = (struct ip_header *) l3hdr;
    offset_units = frag_offset / IP_FRAG_UNIT_SIZE;

    assert(frag_offset % IP_FRAG_UNIT_SIZE == 0);
    assert((offset_units & ~IP_OFFMASK) == 0);

    /* Preserve flags other than the offset and "more fragments" bits */
    ip_off = be16_to_cpu(iphdr->ip_off) & ~(IP_OFFMASK|IP_MF);
    ip_off |= offset_units;
    if (more_frags) {
        ip_off |= IP_MF;
    }

    iphdr->ip_off = cpu_to_be16(ip_off);
    iphdr->ip_len = cpu_to_be16(l3payload_len + l3hdr_len);
}

void
eth_fix_ip4_checksum(void *l3hdr, size_t l3hdr_len)
{
Expand Down

0 comments on commit 02ef5fd

Please sign in to comment.