From 69d05d7f3b82cf07c062fa6baac0aae340ef0531 Mon Sep 17 00:00:00 2001 From: sp-hash Date: Wed, 20 Apr 2016 19:00:14 +0200 Subject: [PATCH] faster groestl-512 --- bitslice_transformations_quad.cu | 121 ++++++++++++++++++------------- quark/cuda_quark_groestl512.cu | 6 +- 2 files changed, 72 insertions(+), 55 deletions(-) diff --git a/bitslice_transformations_quad.cu b/bitslice_transformations_quad.cu index 8dfdb4316..a174ec31c 100644 --- a/bitslice_transformations_quad.cu +++ b/bitslice_transformations_quad.cu @@ -14,13 +14,35 @@ x=__byte_perm(x, y, 0x5410); \ y=__byte_perm(x, y, 0x7632); -#define SWAP4(x,y)\ - t = (y<<4); \ - t = (x ^ t); \ - t = 0xf0f0f0f0UL & t; \ - x = (x ^ t); \ - t= t>>4;\ - y= y ^ t; +__device__ __forceinline__ void SWAP4(uint32_t &x, uint32_t &y, uint32_t m) +{ + uint32_t t = (y << 4); + t = (x ^ t) & m; + // asm("lop3.b32 %0, %1, %2, %3, 0x28;" : "=r"(t) : "r"(x), "r"(t), "r"(m)); //0x28 = (0xF0 ^ 0xCC) & 0xAA + x = (x ^ t); + t = t >> 4; + y = y ^ t; +} + +__device__ __forceinline__ void SWAP2(uint32_t &x, uint32_t &y, uint32_t m) +{ + uint32_t t = (y << 2); + t = (x ^ t) & m; + //asm("lop3.b32 %0, %1, %2, %3, 0x28;" : "=r"(t) : "r"(x), "r"(t), "r"(m)); //0x28 = (0xF0 ^ 0xCC) & 0xAA + x = (x ^ t); + t = t >> 2; + y = y ^ t; +} + +__device__ __forceinline__ void SWAP1(uint32_t &x, uint32_t &y, uint32_t m) +{ + uint32_t t = (y << 1); + t = (x ^ t) & m; +// asm("lop3.b32 %0, %1, %2, %3, 0x28;" : "=r"(t) : "r"(x), "r"(t), "r"(m)); //0x28 = (0xF0 ^ 0xCC) & 0xAA + x = (x ^ t); + t = t >> 1; + y = y ^ t; +} #define SWAP4_final(x,y)\ asm("and.b32 %0, %0, 0x0f0f0f0f;"\ @@ -28,24 +50,6 @@ "vshl.u32.u32.u32.clamp.add %0, %1, 4, %0;\n\t"\ : "+r"(x) : "r"(y));\ - - -#define SWAP2(x,y)\ - t = (y<<2); \ - t = (x ^ t); \ - t = 0xccccccccUL & t; \ - x = (x ^ t); \ - t= t>>2;\ - y= y ^ t; - -#define SWAP1(x,y)\ - t = (y+y); \ - t = (x ^ t); \ - t = 0xaaaaaaaaUL & t; \ - x = (x ^ t); \ - t= t>>1;\ - y= y ^ t; - __device__ __forceinline__ void to_bitslice_quad(uint32_t *const __restrict__ input, uint32_t *const __restrict__ output) { @@ -62,6 +66,10 @@ void to_bitslice_quad(uint32_t *const __restrict__ input, uint32_t *const __rest input[i] = __shfl((int)input[i], threadIdx.x & 2, 4); other[i] = __shfl((int)other[i], threadIdx.x & 2, 4); } + register uint32_t m1 = 0xaaaaaaaaUL; + register uint32_t m2 = 0xccccccccUL; + register uint32_t m4 = 0xf0f0f0f0UL; + merge8(output[0], input[0], input[4], perm); merge8(output[1], other[0], other[4], perm); @@ -72,20 +80,20 @@ void to_bitslice_quad(uint32_t *const __restrict__ input, uint32_t *const __rest merge8(output[6], input[3], input[7], perm); merge8(output[7], other[3], other[7], perm); - SWAP1(output[0], output[1]); - SWAP1(output[2], output[3]); - SWAP1(output[4], output[5]); - SWAP1(output[6], output[7]); + SWAP1(output[0], output[1],m1); + SWAP1(output[2], output[3], m1); + SWAP1(output[4], output[5], m1); + SWAP1(output[6], output[7], m1); - SWAP2(output[0], output[2]); - SWAP2(output[1], output[3]); - SWAP2(output[4], output[6]); - SWAP2(output[5], output[7]); + SWAP2(output[0], output[2], m2); + SWAP2(output[1], output[3], m2); + SWAP2(output[4], output[6], m2); + SWAP2(output[5], output[7], m2); - SWAP4(output[0], output[4]); - SWAP4(output[1], output[5]); - SWAP4(output[2], output[6]); - SWAP4(output[3], output[7]); + SWAP4(output[0], output[4], m4); + SWAP4(output[1], output[5], m4); + SWAP4(output[2], output[6], m4); + SWAP4(output[3], output[7], m4); } __device__ __forceinline__ @@ -94,16 +102,20 @@ void from_bitslice_quad(const uint32_t *const __restrict__ input, uint32_t *cons uint32_t t; const uint32_t perm = 0x7531;//(threadIdx.x & 1) ? 0x3175 : 0x7531; + register uint32_t m1 = 0xaaaaaaaaUL; + register uint32_t m2 = 0xccccccccUL; + register uint32_t m4 = 0xf0f0f0f0UL; + output[0] = __byte_perm(input[0], input[4], perm); output[2] = __byte_perm(input[1], input[5], perm); output[8] = __byte_perm(input[2], input[6], perm); output[10] = __byte_perm(input[3], input[7], perm); - SWAP1(output[0], output[2]); - SWAP1(output[8], output[10]); + SWAP1(output[0], output[2], m1); + SWAP1(output[8], output[10], m1); - SWAP2(output[0], output[8]); - SWAP2(output[2], output[10]); + SWAP2(output[0], output[8], m2); + SWAP2(output[2], output[10], m2); output[4] = __byte_perm(output[0], output[8], 0x5410); output[8] = __byte_perm(output[0], output[8], 0x7632); @@ -113,8 +125,8 @@ void from_bitslice_quad(const uint32_t *const __restrict__ input, uint32_t *cons output[10] = __byte_perm(output[2], output[10], 0x7632); output[2] = output[6]; - SWAP4(output[0], output[8]); - SWAP4(output[2], output[10]); + SWAP4(output[0], output[8], m4); + SWAP4(output[2], output[10], m4); if (threadIdx.x & 1) { @@ -162,6 +174,11 @@ void from_bitslice_quad_final(const uint32_t *const __restrict__ input, uint32_t uint32_t t; const uint32_t perm = 0x7531;//(threadIdx.x & 1) ? 0x3175 : 0x7531; + + register uint32_t m1 = 0xaaaaaaaaUL; + register uint32_t m2 = 0xccccccccUL; + register uint32_t m4 = 0xf0f0f0f0UL; + if (threadIdx.x & 3) { @@ -169,9 +186,9 @@ void from_bitslice_quad_final(const uint32_t *const __restrict__ input, uint32_t output[2] = __byte_perm(input[1], input[5], perm); output[8] = __byte_perm(input[2], input[6], perm); output[10] = __byte_perm(input[3], input[7], perm); - SWAP1(output[0], output[2]); - SWAP1(output[8], output[10]); - SWAP2(output[2], output[10]); + SWAP1(output[0], output[2],m1); + SWAP1(output[8], output[10], m1); + SWAP2(output[2], output[10], m2); output[6] = __byte_perm(output[2], output[10], 0x5410); output[10] = __byte_perm(output[2], output[10], 0x7632); SWAP4_final(output[6], output[10]); @@ -183,11 +200,11 @@ void from_bitslice_quad_final(const uint32_t *const __restrict__ input, uint32_t output[8] = __byte_perm(input[2], input[6], perm); output[10] = __byte_perm(input[3], input[7], perm); - SWAP1(output[0], output[2]); - SWAP1(output[8], output[10]); + SWAP1(output[0], output[2],m1); + SWAP1(output[8], output[10], m1); - SWAP2(output[0], output[8]); - SWAP2(output[2], output[10]); + SWAP2(output[0], output[8], m2); + SWAP2(output[2], output[10], m2); output[4] = __byte_perm(output[0], output[8], 0x5410); output[8] = __byte_perm(output[0], output[8], 0x7632); @@ -197,8 +214,8 @@ void from_bitslice_quad_final(const uint32_t *const __restrict__ input, uint32_t output[10] = __byte_perm(output[2], output[10], 0x7632); output[2] = output[6]; - SWAP4(output[0], output[8]); - SWAP4(output[2], output[10]); + SWAP4(output[0], output[8], m4); + SWAP4(output[2], output[10], m4); if (threadIdx.x & 1) { diff --git a/quark/cuda_quark_groestl512.cu b/quark/cuda_quark_groestl512.cu index ee5aa8b11..0ca56175c 100644 --- a/quark/cuda_quark_groestl512.cu +++ b/quark/cuda_quark_groestl512.cu @@ -16,9 +16,9 @@ __global__ __launch_bounds__(TPB, 2) void quark_groestl512_gpu_hash_64_quad(uint32_t threads, uint32_t startNounce, uint32_t *const __restrict__ g_hash, const uint32_t *const __restrict__ g_nonceVector) { - uint32_t msgBitsliced[8]; - uint32_t state[8]; - uint32_t hash[16]; + uint32_t __align__(16) msgBitsliced[8]; + uint32_t __align__(16) state[8]; + uint32_t __align__(16) hash[16]; // durch 4 dividieren, weil jeweils 4 Threads zusammen ein Hash berechnen uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x) >> 2; if (thread < threads)