Skip to content

Commit

Permalink
faster groestl-512
Browse files Browse the repository at this point in the history
  • Loading branch information
runestensland committed Apr 20, 2016
1 parent ea1820a commit 69d05d7
Show file tree
Hide file tree
Showing 2 changed files with 72 additions and 55 deletions.
121 changes: 69 additions & 52 deletions bitslice_transformations_quad.cu
Expand Up @@ -14,38 +14,42 @@
x=__byte_perm(x, y, 0x5410); \
y=__byte_perm(x, y, 0x7632);

#define SWAP4(x,y)\
t = (y<<4); \
t = (x ^ t); \
t = 0xf0f0f0f0UL & t; \
x = (x ^ t); \
t= t>>4;\
y= y ^ t;
__device__ __forceinline__ void SWAP4(uint32_t &x, uint32_t &y, uint32_t m)
{
uint32_t t = (y << 4);
t = (x ^ t) & m;
// asm("lop3.b32 %0, %1, %2, %3, 0x28;" : "=r"(t) : "r"(x), "r"(t), "r"(m)); //0x28 = (0xF0 ^ 0xCC) & 0xAA
x = (x ^ t);
t = t >> 4;
y = y ^ t;
}

__device__ __forceinline__ void SWAP2(uint32_t &x, uint32_t &y, uint32_t m)
{
uint32_t t = (y << 2);
t = (x ^ t) & m;
//asm("lop3.b32 %0, %1, %2, %3, 0x28;" : "=r"(t) : "r"(x), "r"(t), "r"(m)); //0x28 = (0xF0 ^ 0xCC) & 0xAA
x = (x ^ t);
t = t >> 2;
y = y ^ t;
}

__device__ __forceinline__ void SWAP1(uint32_t &x, uint32_t &y, uint32_t m)
{
uint32_t t = (y << 1);
t = (x ^ t) & m;
// asm("lop3.b32 %0, %1, %2, %3, 0x28;" : "=r"(t) : "r"(x), "r"(t), "r"(m)); //0x28 = (0xF0 ^ 0xCC) & 0xAA
x = (x ^ t);
t = t >> 1;
y = y ^ t;
}

#define SWAP4_final(x,y)\
asm("and.b32 %0, %0, 0x0f0f0f0f;"\
"and.b32 %1, %1, 0x0f0f0f0f;"\
"vshl.u32.u32.u32.clamp.add %0, %1, 4, %0;\n\t"\
: "+r"(x) : "r"(y));\



#define SWAP2(x,y)\
t = (y<<2); \
t = (x ^ t); \
t = 0xccccccccUL & t; \
x = (x ^ t); \
t= t>>2;\
y= y ^ t;

#define SWAP1(x,y)\
t = (y+y); \
t = (x ^ t); \
t = 0xaaaaaaaaUL & t; \
x = (x ^ t); \
t= t>>1;\
y= y ^ t;

__device__ __forceinline__
void to_bitslice_quad(uint32_t *const __restrict__ input, uint32_t *const __restrict__ output)
{
Expand All @@ -62,6 +66,10 @@ void to_bitslice_quad(uint32_t *const __restrict__ input, uint32_t *const __rest
input[i] = __shfl((int)input[i], threadIdx.x & 2, 4);
other[i] = __shfl((int)other[i], threadIdx.x & 2, 4);
}
register uint32_t m1 = 0xaaaaaaaaUL;
register uint32_t m2 = 0xccccccccUL;
register uint32_t m4 = 0xf0f0f0f0UL;


merge8(output[0], input[0], input[4], perm);
merge8(output[1], other[0], other[4], perm);
Expand All @@ -72,20 +80,20 @@ void to_bitslice_quad(uint32_t *const __restrict__ input, uint32_t *const __rest
merge8(output[6], input[3], input[7], perm);
merge8(output[7], other[3], other[7], perm);

SWAP1(output[0], output[1]);
SWAP1(output[2], output[3]);
SWAP1(output[4], output[5]);
SWAP1(output[6], output[7]);
SWAP1(output[0], output[1],m1);
SWAP1(output[2], output[3], m1);
SWAP1(output[4], output[5], m1);
SWAP1(output[6], output[7], m1);

SWAP2(output[0], output[2]);
SWAP2(output[1], output[3]);
SWAP2(output[4], output[6]);
SWAP2(output[5], output[7]);
SWAP2(output[0], output[2], m2);
SWAP2(output[1], output[3], m2);
SWAP2(output[4], output[6], m2);
SWAP2(output[5], output[7], m2);

SWAP4(output[0], output[4]);
SWAP4(output[1], output[5]);
SWAP4(output[2], output[6]);
SWAP4(output[3], output[7]);
SWAP4(output[0], output[4], m4);
SWAP4(output[1], output[5], m4);
SWAP4(output[2], output[6], m4);
SWAP4(output[3], output[7], m4);
}

__device__ __forceinline__
Expand All @@ -94,16 +102,20 @@ void from_bitslice_quad(const uint32_t *const __restrict__ input, uint32_t *cons
uint32_t t;
const uint32_t perm = 0x7531;//(threadIdx.x & 1) ? 0x3175 : 0x7531;

register uint32_t m1 = 0xaaaaaaaaUL;
register uint32_t m2 = 0xccccccccUL;
register uint32_t m4 = 0xf0f0f0f0UL;

output[0] = __byte_perm(input[0], input[4], perm);
output[2] = __byte_perm(input[1], input[5], perm);
output[8] = __byte_perm(input[2], input[6], perm);
output[10] = __byte_perm(input[3], input[7], perm);

SWAP1(output[0], output[2]);
SWAP1(output[8], output[10]);
SWAP1(output[0], output[2], m1);
SWAP1(output[8], output[10], m1);

SWAP2(output[0], output[8]);
SWAP2(output[2], output[10]);
SWAP2(output[0], output[8], m2);
SWAP2(output[2], output[10], m2);

output[4] = __byte_perm(output[0], output[8], 0x5410);
output[8] = __byte_perm(output[0], output[8], 0x7632);
Expand All @@ -113,8 +125,8 @@ void from_bitslice_quad(const uint32_t *const __restrict__ input, uint32_t *cons
output[10] = __byte_perm(output[2], output[10], 0x7632);
output[2] = output[6];

SWAP4(output[0], output[8]);
SWAP4(output[2], output[10]);
SWAP4(output[0], output[8], m4);
SWAP4(output[2], output[10], m4);

if (threadIdx.x & 1)
{
Expand Down Expand Up @@ -162,16 +174,21 @@ void from_bitslice_quad_final(const uint32_t *const __restrict__ input, uint32_t
uint32_t t;
const uint32_t perm = 0x7531;//(threadIdx.x & 1) ? 0x3175 : 0x7531;


register uint32_t m1 = 0xaaaaaaaaUL;
register uint32_t m2 = 0xccccccccUL;
register uint32_t m4 = 0xf0f0f0f0UL;

if (threadIdx.x & 3)
{

output[0] = __byte_perm(input[0], input[4], perm);
output[2] = __byte_perm(input[1], input[5], perm);
output[8] = __byte_perm(input[2], input[6], perm);
output[10] = __byte_perm(input[3], input[7], perm);
SWAP1(output[0], output[2]);
SWAP1(output[8], output[10]);
SWAP2(output[2], output[10]);
SWAP1(output[0], output[2],m1);
SWAP1(output[8], output[10], m1);
SWAP2(output[2], output[10], m2);
output[6] = __byte_perm(output[2], output[10], 0x5410);
output[10] = __byte_perm(output[2], output[10], 0x7632);
SWAP4_final(output[6], output[10]);
Expand All @@ -183,11 +200,11 @@ void from_bitslice_quad_final(const uint32_t *const __restrict__ input, uint32_t
output[8] = __byte_perm(input[2], input[6], perm);
output[10] = __byte_perm(input[3], input[7], perm);

SWAP1(output[0], output[2]);
SWAP1(output[8], output[10]);
SWAP1(output[0], output[2],m1);
SWAP1(output[8], output[10], m1);

SWAP2(output[0], output[8]);
SWAP2(output[2], output[10]);
SWAP2(output[0], output[8], m2);
SWAP2(output[2], output[10], m2);

output[4] = __byte_perm(output[0], output[8], 0x5410);
output[8] = __byte_perm(output[0], output[8], 0x7632);
Expand All @@ -197,8 +214,8 @@ void from_bitslice_quad_final(const uint32_t *const __restrict__ input, uint32_t
output[10] = __byte_perm(output[2], output[10], 0x7632);
output[2] = output[6];

SWAP4(output[0], output[8]);
SWAP4(output[2], output[10]);
SWAP4(output[0], output[8], m4);
SWAP4(output[2], output[10], m4);

if (threadIdx.x & 1)
{
Expand Down
6 changes: 3 additions & 3 deletions quark/cuda_quark_groestl512.cu
Expand Up @@ -16,9 +16,9 @@
__global__ __launch_bounds__(TPB, 2)
void quark_groestl512_gpu_hash_64_quad(uint32_t threads, uint32_t startNounce, uint32_t *const __restrict__ g_hash, const uint32_t *const __restrict__ g_nonceVector)
{
uint32_t msgBitsliced[8];
uint32_t state[8];
uint32_t hash[16];
uint32_t __align__(16) msgBitsliced[8];
uint32_t __align__(16) state[8];
uint32_t __align__(16) hash[16];
// durch 4 dividieren, weil jeweils 4 Threads zusammen ein Hash berechnen
uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x) >> 2;
if (thread < threads)
Expand Down

0 comments on commit 69d05d7

Please sign in to comment.