PMULL implementation for CRC computation #4082

Closed · wants to merge 2 commits · Changes from all commits
141 changes: 139 additions & 2 deletions src/crc16.c
@@ -3,6 +3,7 @@
/*
* Copyright 2001-2010 Georges Menie (www.menie.org)
* Copyright 2010-2012 Salvatore Sanfilippo (adapted to Redis coding style)
* Copyright 2017 ARM Ltd.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -79,10 +80,146 @@ static const uint16_t crc16tab[256]= {
0x6e17,0x7e36,0x4e55,0x5e74,0x2e93,0x3eb2,0x0ed1,0x1ef0
};

#if defined(__ARM_NEON) && defined(__ARM_FEATURE_CRYPTO)
#define HAVE_CLMUL 1

#include <arm_neon.h>

#define __shift_p128_left(data, imm) vreinterpretq_u64_u8(vextq_u8(vdupq_n_u8(0), vreinterpretq_u8_u64((data)), (imm)))
#define __shift_p128_right(data, imm) vreinterpretq_u64_u8(vextq_u8(vreinterpretq_u8_u64((data)), vdupq_n_u8(0), (imm)))

static inline uint64x2_t endian_swap(uint64x2_t val)
{
return (uint64x2_t)__builtin_shuffle((uint8x16_t)val, (uint8x16_t){ 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0});
}

static inline uint64x2_t fold_128b(uint64x2_t to, const uint64x2_t from, const uint64x2_t constant)
{
uint64x2_t tmp_h = (uint64x2_t)vmull_p64((poly64_t)vgetq_lane_u64(from, 1), (poly64_t)vgetq_lane_u64(constant, 1));
uint64x2_t tmp_l = (uint64x2_t)vmull_p64((poly64_t)vgetq_lane_u64(from, 0), (poly64_t)vgetq_lane_u64(constant, 0));
return veorq_u64(tmp_l, veorq_u64(to, tmp_h));
}

static inline uint64x2_t crc16_fold(uint64x2_t from, const uint64x2_t constant)
{
uint64x2_t tmp = from;

from = (uint64x2_t)vmull_p64((poly64_t)vgetq_lane_u64(from, 1), (poly64_t)vgetq_lane_u64(constant, 1));

/* get from:low_64b + 32b '0' appended (96b total) */
tmp = __shift_p128_right(__shift_p128_left(tmp, 8), 4);
from = veorq_u64(from, tmp);

/* 96bit --> 64bit */
tmp = from;
from = (uint64x2_t)vmull_p64((poly64_t)vgetq_lane_u64(from, 1), (poly64_t)vgetq_lane_u64(constant, 0));
return veorq_u64(from, tmp);
}

static inline uint64_t crc16_barrett_reduction(uint64x2_t data, const uint64x2_t p_q)
{
uint64x2_t tmp = vcombine_u64((uint64x1_t)vgetq_lane_u64(data, 0), (uint64x1_t)0ULL);

/* T1 = floor(R(x)/x^32) * [1/P(x)]; */
tmp = (uint64x2_t)vmull_p64((poly64_t)vgetq_lane_u64(__shift_p128_right(tmp, 4), 0), (poly64_t)vgetq_lane_u64(p_q, 0));
/* T2 = floor(T1/x^32) * P(x) */
tmp = (uint64x2_t)vmull_p64((poly64_t)vgetq_lane_u64(__shift_p128_right(tmp, 4), 0), (poly64_t)vgetq_lane_u64(p_q, 1));
/* R-int(R/P)*P */
data = veorq_u64(tmp, data);

return vgetq_lane_u64(data, 0);
}

/*
* crc16_clmul assumes:
* 1. input buffer s is 16-byte aligned
* 2. buffer length is 64*N bytes
*
*/
static uint64_t crc16_clmul(uint64_t crc, const unsigned char *s, uint64_t l)
{
/* pre-computed constants */
const uint64x2_t foldConstants_p4 = {0x0000000059b00000ULL, 0x0000000060190000ULL};
const uint64x2_t foldConstants_p1 = {0x0000000045630000ULL, 0x00000000d5f60000ULL};
const uint64x2_t foldConstants_p0 = {0x00000000aa510000ULL, 0x00000000eb230000ULL};
const uint64x2_t foldConstants_br = {0x0000000111303471ULL, 0x0000000110210000ULL};

crc <<= 48;
uint64x2_t *p_data = (uint64x2_t *)s;
uint64_t remain_len = l;

uint64x2_t x0, x1, x2, x3;
uint64x2_t y0, y1, y2, y3;

/* expand crc to 128bit */
y0 = vcombine_u64((uint64x1_t)0ULL, (uint64x1_t)crc);
/* load first 64B */
x0 = *p_data++; x0 = endian_swap(x0);
x1 = *p_data++; x1 = endian_swap(x1);
x2 = *p_data++; x2 = endian_swap(x2);
x3 = *p_data++; x3 = endian_swap(x3);

remain_len -= 64;

x0 = x0 ^ y0; /* x0 ^ crc */

/* 1024bit --> 512bit loop */
while(remain_len >= 64) {
y0 = *p_data++; y0 = endian_swap(y0);
y1 = *p_data++; y1 = endian_swap(y1);
y2 = *p_data++; y2 = endian_swap(y2);
y3 = *p_data++; y3 = endian_swap(y3);

x0 = fold_128b(y0, x0, foldConstants_p4);
x1 = fold_128b(y1, x1, foldConstants_p4);
x2 = fold_128b(y2, x2, foldConstants_p4);
x3 = fold_128b(y3, x3, foldConstants_p4);

remain_len -= 64;
}

/* folding 512bit --> 128bit */
x1 = fold_128b(x1, x0, foldConstants_p1);
x2 = fold_128b(x2, x1, foldConstants_p1);
x0 = fold_128b(x3, x2, foldConstants_p1);

x0 = crc16_fold(x0, foldConstants_p0);

/* calc crc using barrett reduction method */
crc = crc16_barrett_reduction(x0, foldConstants_br);
crc >>= 16;

return crc;
}
#endif

uint16_t crc16(const char *buf, int len) {
int counter = 0;
uint16_t crc = 0;

#if HAVE_CLMUL
uint64_t remain = len;
uint64_t l = len;
/* make sure 16-byte aligned for CLMUL routine */
if ((uintptr_t)buf & 15) {
int64_t n = (l > (-(uintptr_t)buf & 15))? -(uintptr_t)buf & 15 : l;
for (; counter<n; counter++) {
crc = (crc<<8) ^ crc16tab[((crc>>8) ^ *buf++)&0x00FF];
}
remain -= n;
}
if (remain >= 1*1024) {
/*
* Use CLMUL to compute CRC for a "large" block
* this block is 64*N bytes
*/
/* buf was already advanced by the alignment loop above, so pass it directly
 * and advance buf and counter by the same number of processed bytes */
crc = crc16_clmul(crc, (const unsigned char *)buf, remain&(~63ULL));
counter += remain & (~63ULL);
buf += remain & (~63ULL);
}
#endif

for (; counter < len; counter++)
crc = (crc<<8) ^ crc16tab[((crc>>8) ^ *buf++)&0x00FF];
return crc;
}
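
As a quick sanity check for the crc16 changes, the following standalone sketch (editor-added, not part of this PR) implements the same CRC-16/XMODEM variant bit by bit, so the table-driven fallback and the CLMUL path can be compared against it; the 0x31c3 check value for "123456789" is the conventional one for this variant and is stated here as an assumption.

/* Bit-by-bit CRC-16/XMODEM reference (sketch, not part of this diff). */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static uint16_t crc16_bitwise(const char *buf, int len) {
    uint16_t crc = 0;
    for (int i = 0; i < len; i++) {
        crc ^= (uint16_t)((unsigned char)buf[i]) << 8; /* next byte enters the top */
        for (int b = 0; b < 8; b++)                    /* polynomial 0x1021, MSB-first */
            crc = (crc & 0x8000) ? (uint16_t)((crc << 1) ^ 0x1021) : (uint16_t)(crc << 1);
    }
    return crc;
}

int main(void) {
    const char *check = "123456789";
    printf("%04x\n", crc16_bitwise(check, (int)strlen(check))); /* expected: 31c3 (assumed check value) */
    return 0;
}
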
115 changes: 113 additions & 2 deletions src/crc64.c
@@ -11,6 +11,7 @@
* Check("123456789"): 0xe9c6d914c4b8d9ca
*
* Copyright (c) 2012, Salvatore Sanfilippo <antirez at gmail dot com>
* Copyright (c) 2017 ARM Ltd
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -39,6 +40,95 @@

#include <stdint.h>

#if defined(__ARM_NEON) && defined(__ARM_FEATURE_CRYPTO)
#define HAVE_CLMUL 1

#include <arm_neon.h>

#define __shift_p128_left(data, imm) vreinterpretq_u64_u8(vextq_u8(vdupq_n_u8(0), vreinterpretq_u8_u64((data)), (imm)))
#define __shift_p128_right(data, imm) vreinterpretq_u64_u8(vextq_u8(vreinterpretq_u8_u64((data)), vdupq_n_u8(0), (imm)))

static inline uint64x2_t fold_128b(uint64x2_t to, const uint64x2_t from, const uint64x2_t constant)
{
uint64x2_t tmp_h = (uint64x2_t)vmull_p64((poly64_t)vgetq_lane_u64(from, 1), (poly64_t)vgetq_lane_u64(constant, 1));
uint64x2_t tmp_l = (uint64x2_t)vmull_p64((poly64_t)vgetq_lane_u64(from, 0), (poly64_t)vgetq_lane_u64(constant, 0));
return veorq_u64(tmp_l, veorq_u64(to, tmp_h));
}

static inline uint64x2_t crc64_fold_(uint64x2_t from, const uint64x2_t constant)
{
uint64x2_t tmp = __shift_p128_right(from, 8);
from = (uint64x2_t)vmull_p64((poly64_t)vgetq_lane_u64(from, 0), (poly64_t)vgetq_lane_u64(constant, 0));
return veorq_u64(from, tmp);
}

static inline uint64_t crc64_barrett_reduction(uint64x2_t data, const uint64x2_t p_q)
{
uint64x2_t tmp1 = data;
uint64x2_t tmp2 = (uint64x2_t)vmull_p64((poly64_t)vgetq_lane_u64(data, 0), (poly64_t)vgetq_lane_u64(p_q, 0));
data = (uint64x2_t)vmull_p64((poly64_t)vgetq_lane_u64(tmp2, 0), (poly64_t)vgetq_lane_u64(p_q, 1));
tmp2 = __shift_p128_left(tmp2, 8);
data = veorq_u64(data, tmp1);
data = veorq_u64(data, tmp2);

return vgetq_lane_u64(data, 1);
}

/*
* crc64_clmul assumes:
* 1. input buffer s is 16-byte aligned
* 2. buffer length is 64*N bytes
*
*/
static uint64_t crc64_clmul(uint64_t crc, const unsigned char *s, uint64_t l)
{
/* pre-computed constants */
const uint64x2_t foldConstants_p4 = {0xaf86efb16d9ab4fbULL, 0xf49784a634f014e4ULL};
const uint64x2_t foldConstants_p1 = {0xd9d7be7d505da32cULL, 0x381d0015c96f4444ULL};
const uint64x2_t foldConstants_p0 = {0x381d0015c96f4444ULL, 0x0ULL};
const uint64x2_t foldConstants_br = {0x3e6cfa329aef9f77ULL, 0x2b5926535897936bULL};

uint64x2_t *p_data = (uint64x2_t *)s;
uint64_t remain_len = l;

uint64x2_t x0, x1, x2, x3;
uint64x2_t y0, y1, y2, y3;

/* expand crc to 128bit */
y0 = vcombine_u64((uint64x1_t)crc, (uint64x1_t)0ULL);
/* load first 64B */
x0 = *p_data++; x1 = *p_data++; x2 = *p_data++; x3 = *p_data++;
remain_len -= 64;

x0 = x0 ^ y0; /* x0 ^ crc */

/* 1024bit --> 512bit loop */
while(remain_len >= 64) {
y0 = *p_data++; y1 = *p_data++;
y2 = *p_data++; y3 = *p_data++;

x0 = fold_128b(y0, x0, foldConstants_p4);
x1 = fold_128b(y1, x1, foldConstants_p4);
x2 = fold_128b(y2, x2, foldConstants_p4);
x3 = fold_128b(y3, x3, foldConstants_p4);

remain_len -= 64;
}

/* folding 512bit --> 128bit */
x1 = fold_128b(x1, x0, foldConstants_p1);
x2 = fold_128b(x2, x1, foldConstants_p1);
x0 = fold_128b(x3, x2, foldConstants_p1);

x0 = crc64_fold_(x0, foldConstants_p0);

/* calc crc using barrett reduction method */
crc = crc64_barrett_reduction(x0, foldConstants_br);

return crc;
}
#endif

static const uint64_t crc64_tab[256] = {
UINT64_C(0x0000000000000000), UINT64_C(0x7ad870c830358979),
UINT64_C(0xf5b0e190606b12f2), UINT64_C(0x8f689158505e9b8b),
@@ -171,9 +261,30 @@ static const uint64_t crc64_tab[256] = {
};

uint64_t crc64(uint64_t crc, const unsigned char *s, uint64_t l) {
uint64_t j = 0;

#if HAVE_CLMUL
uint64_t remain = l;
/* make sure 16-byte aligned for CLMUL routine */
if ((uintptr_t)s & 15) {
uint64_t n = (l > (-(uintptr_t)s & 15))? -(uintptr_t)s & 15 : l;
for (; j<n; j++) {
uint8_t byte = s[j];
crc = crc64_tab[(uint8_t)crc ^ byte] ^ (crc >> 8);
}
remain -= n;
}
if (remain >= 1*1024) {
/*
* Use CLMUL to compute CRC for a "large" block
* this block is 64*N bytes
*/
crc = crc64_clmul(crc, s+j, remain&(~63ULL));
j += remain & (~63ULL);
remain &= 63ULL;
}
#endif
for (; j < l; j++) {
uint8_t byte = s[j];
crc = crc64_tab[(uint8_t)crc ^ byte] ^ (crc >> 8);
}
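
For the crc64 side, a similar standalone reference (editor-added sketch, not part of this PR) can be used to validate both the table and CLMUL paths. The expected check value 0xe9c6d914c4b8d9ca for "123456789" comes from the file header above; the reflected polynomial constant 0x95ac9329ac4bc9b5 is assumed to be the bit-reversed form of the Jones polynomial this file uses.

/* Bit-by-bit reflected CRC-64 reference (sketch, not part of this diff). */
#include <stdint.h>
#include <stdio.h>

static uint64_t crc64_bitwise(uint64_t crc, const unsigned char *s, uint64_t l) {
    for (uint64_t i = 0; i < l; i++) {
        crc ^= s[i]; /* reflected variant: each byte enters the low end */
        for (int b = 0; b < 8; b++)
            crc = (crc & 1) ? (crc >> 1) ^ UINT64_C(0x95ac9329ac4bc9b5) : (crc >> 1);
    }
    return crc;
}

int main(void) {
    const unsigned char check[] = "123456789";
    /* expected: e9c6d914c4b8d9ca, per the check value documented in crc64.c */
    printf("%016llx\n", (unsigned long long)crc64_bitwise(0, check, 9));
    return 0;
}
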