Skip to content

Commit

Permalink
eal: optimize aligned memcpy on x86
Browse files Browse the repository at this point in the history
This patch optimizes rte_memcpy for well-aligned cases, where both
the dst and src addresses are aligned to the maximum MOV width. It
introduces a dedicated function called rte_memcpy_aligned to handle
the aligned cases with a simplified instruction stream. The existing
rte_memcpy is renamed to rte_memcpy_generic. The selection between
the two is made at the entry of rte_memcpy.

The existing rte_memcpy is for generic cases: it handles unaligned
copies and makes stores aligned, and it even makes loads aligned for
microarchitectures like Ivy Bridge. However, alignment handling comes
at a price: it adds extra load/store instructions, which can
sometimes cause complications.

Take DPDK Vhost memcpy with the Mergeable Rx Buffer feature as an
example: the copy is aligned and remote, and there is an accompanying
header write which is also remote. In this case the memcpy instruction
stream should be simplified to reduce extra loads/stores, thereby
reducing the probability of pipeline stalls caused by a full
load/store buffer, so that the actual memcpy instructions are issued
and the H/W prefetcher goes to work as early as possible.

This patch was tested on Ivy Bridge, Haswell and Skylake; it provides
up to 20% gain for Virtio Vhost PVP traffic, with packet sizes ranging
from 64 to 1500 bytes.

The test can also be conducted without NIC, by setting loopback
traffic between Virtio and Vhost. For example, modify the macro
TXONLY_DEF_PACKET_LEN to the requested packet size in testpmd.h,
rebuild and start testpmd in both host and guest, then "start" on
one side and "start tx_first 32" on the other.

Signed-off-by: Zhihong Wang <zhihong.wang@intel.com>
Reviewed-by: Yuanhan Liu <yuanhan.liu@linux.intel.com>
Tested-by: Lei Yao <lei.a.yao@intel.com>
  • Loading branch information
johnakawzh authored and Thomas Monjalon committed Jan 17, 2017
1 parent e2a6f12 commit f547270
Showing 1 changed file with 78 additions and 3 deletions.
81 changes: 78 additions & 3 deletions lib/librte_eal/common/include/arch/x86/rte_memcpy.h
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,8 @@ rte_memcpy(void *dst, const void *src, size_t n) __attribute__((always_inline));

#ifdef RTE_MACHINE_CPUFLAG_AVX512F

/*
 * Address bits that must all be zero in both dst and src for rte_memcpy()
 * to dispatch to rte_memcpy_aligned(); 0x3F covers the low 6 bits, i.e. a
 * 64-byte boundary.
 */
#define ALIGNMENT_MASK 0x3F

/**
* AVX512 implementation below
*/
Expand Down Expand Up @@ -189,7 +191,7 @@ rte_mov512blocks(uint8_t *dst, const uint8_t *src, size_t n)
}

static inline void *
rte_memcpy(void *dst, const void *src, size_t n)
rte_memcpy_generic(void *dst, const void *src, size_t n)
{
uintptr_t dstu = (uintptr_t)dst;
uintptr_t srcu = (uintptr_t)src;
Expand Down Expand Up @@ -308,6 +310,8 @@ rte_memcpy(void *dst, const void *src, size_t n)

#elif defined RTE_MACHINE_CPUFLAG_AVX2

/*
 * Address bits that must all be zero in both dst and src for rte_memcpy()
 * to dispatch to rte_memcpy_aligned(); 0x1F covers the low 5 bits, i.e. a
 * 32-byte boundary.
 */
#define ALIGNMENT_MASK 0x1F

/**
* AVX2 implementation below
*/
Expand Down Expand Up @@ -387,7 +391,7 @@ rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
}

static inline void *
rte_memcpy(void *dst, const void *src, size_t n)
rte_memcpy_generic(void *dst, const void *src, size_t n)
{
uintptr_t dstu = (uintptr_t)dst;
uintptr_t srcu = (uintptr_t)src;
Expand Down Expand Up @@ -499,6 +503,8 @@ rte_memcpy(void *dst, const void *src, size_t n)

#else /* RTE_MACHINE_CPUFLAG */

/*
 * Address bits that must all be zero in both dst and src for rte_memcpy()
 * to dispatch to rte_memcpy_aligned(); 0x0F covers the low 4 bits, i.e. a
 * 16-byte boundary.
 */
#define ALIGNMENT_MASK 0x0F

/**
* SSE & AVX implementation below
*/
Expand Down Expand Up @@ -677,7 +683,7 @@ __extension__ ({ \
})

static inline void *
rte_memcpy(void *dst, const void *src, size_t n)
rte_memcpy_generic(void *dst, const void *src, size_t n)
{
__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
uintptr_t dstu = (uintptr_t)dst;
Expand Down Expand Up @@ -821,6 +827,75 @@ rte_memcpy(void *dst, const void *src, size_t n)

#endif /* RTE_MACHINE_CPUFLAG */

/**
 * Copy n bytes from src to dst with a simplified instruction stream.
 *
 * rte_memcpy() dispatches here when neither dst nor src has any of the
 * ALIGNMENT_MASK address bits set. No alignment fix-up is performed; sizes
 * of 16 bytes and above are handled with fixed-width rte_mov16/32/64 copies
 * whose final copy is anchored to the end of the buffer, so some bytes may
 * be written twice.
 *
 * @param dst
 *   Destination buffer.
 * @param src
 *   Source buffer.
 * @param n
 *   Number of bytes to copy.
 * @return
 *   The original value of dst.
 */
static inline void *
rte_memcpy_aligned(void *dst, const void *src, size_t n)
{
	/*
	 * Wrapper types for the sub-16-byte path. Once an odd number of bytes
	 * has been copied, dst/src are no longer aligned for the wider integer
	 * types, and the buffers are not necessarily objects of those types.
	 * "packed" drops the alignment requirement to 1 and "may_alias" exempts
	 * the access from strict-aliasing rules, so the wide loads/stores below
	 * are well defined while still compiling to single moves.
	 */
	struct mov_u16 {
		uint16_t val;
	} __attribute__((__packed__, __may_alias__));
	struct mov_u32 {
		uint32_t val;
	} __attribute__((__packed__, __may_alias__));
	struct mov_u64 {
		uint64_t val;
	} __attribute__((__packed__, __may_alias__));

	void *ret = dst;

	/* Copy size < 16 bytes: one move per set bit of n (1, 2, 4, 8) */
	if (n < 16) {
		if (n & 0x01) {
			*(uint8_t *)dst = *(const uint8_t *)src;
			src = (const uint8_t *)src + 1;
			dst = (uint8_t *)dst + 1;
		}
		if (n & 0x02) {
			((struct mov_u16 *)dst)->val =
				((const struct mov_u16 *)src)->val;
			src = (const uint8_t *)src + 2;
			dst = (uint8_t *)dst + 2;
		}
		if (n & 0x04) {
			((struct mov_u32 *)dst)->val =
				((const struct mov_u32 *)src)->val;
			src = (const uint8_t *)src + 4;
			dst = (uint8_t *)dst + 4;
		}
		if (n & 0x08)
			((struct mov_u64 *)dst)->val =
				((const struct mov_u64 *)src)->val;

		return ret;
	}

	/* Copy 16 <= size <= 32 bytes: first 16 and last 16, may overlap */
	if (n <= 32) {
		rte_mov16((uint8_t *)dst, (const uint8_t *)src);
		rte_mov16((uint8_t *)dst - 16 + n,
				(const uint8_t *)src - 16 + n);

		return ret;
	}

	/* Copy 32 < size <= 64 bytes: first 32 and last 32, may overlap */
	if (n <= 64) {
		rte_mov32((uint8_t *)dst, (const uint8_t *)src);
		rte_mov32((uint8_t *)dst - 32 + n,
				(const uint8_t *)src - 32 + n);

		return ret;
	}

	/*
	 * Copy 64 bytes blocks. Stop while 1..64 bytes remain so that the
	 * trailing copy below always has new data to move; looping while
	 * n >= 64 would copy the last block twice whenever the size is a
	 * multiple of 64.
	 */
	for (; n > 64; n -= 64) {
		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
		dst = (uint8_t *)dst + 64;
		src = (const uint8_t *)src + 64;
	}

	/* Copy whatever left: the final 64 bytes, anchored to the buffer end */
	rte_mov64((uint8_t *)dst - 64 + n,
			(const uint8_t *)src - 64 + n);

	return ret;
}

/**
 * Copy n bytes from src to dst, choosing the implementation by alignment.
 *
 * If either pointer has any ALIGNMENT_MASK address bit set, the copy goes
 * through rte_memcpy_generic(); otherwise it goes through
 * rte_memcpy_aligned().
 *
 * @param dst
 *   Destination buffer.
 * @param src
 *   Source buffer.
 * @param n
 *   Number of bytes to copy.
 * @return
 *   Whatever the selected implementation returns.
 */
static inline void *
rte_memcpy(void *dst, const void *src, size_t n)
{
	/* OR-ing the addresses lets a single mask test cover both pointers */
	const uintptr_t addr_bits = (uintptr_t)dst | (uintptr_t)src;

	if ((addr_bits & ALIGNMENT_MASK) != 0)
		return rte_memcpy_generic(dst, src, n);

	return rte_memcpy_aligned(dst, src, n);
}

#ifdef __cplusplus
}
#endif
Expand Down

0 comments on commit f547270

Please sign in to comment.