From 69d05d7f3b82cf07c062fa6baac0aae340ef0531 Mon Sep 17 00:00:00 2001
From: sp-hash <rune.stensland@bradyplc.com>
Date: Wed, 20 Apr 2016 19:00:14 +0200
Subject: [PATCH] faster groestl-512

---
 bitslice_transformations_quad.cu | 121 ++++++++++++++++++-------------
 quark/cuda_quark_groestl512.cu   |   6 +-
 2 files changed, 72 insertions(+), 55 deletions(-)

diff --git a/bitslice_transformations_quad.cu b/bitslice_transformations_quad.cu
index 8dfdb4316..a174ec31c 100644
--- a/bitslice_transformations_quad.cu
+++ b/bitslice_transformations_quad.cu
@@ -14,13 +14,35 @@
 		x=__byte_perm(x, y, 0x5410); \
 		y=__byte_perm(x, y, 0x7632);
 
-#define SWAP4(x,y)\
-		t = (y<<4); \
-		t = (x ^ t); \
-		t = 0xf0f0f0f0UL & t; \
-		x = (x ^ t); \
-		t=  t>>4;\
-		y=  y ^ t;
+__device__ __forceinline__ void SWAP4(uint32_t &x, uint32_t &y, uint32_t m)
+{
+		uint32_t t = (y << 4);	
+		t = (x ^ t) & m;
+	//	asm("lop3.b32 %0, %1, %2, %3, 0x28;" : "=r"(t) : "r"(x), "r"(t), "r"(m)); //0x28 = (0xF0 ^ 0xCC) & 0xAA 
+		x = (x ^ t);
+		t = t >> 4;
+		y = y ^ t;
+}
+
+__device__ __forceinline__ void SWAP2(uint32_t &x, uint32_t &y, uint32_t m)
+{
+	uint32_t t = (y << 2);
+	t = (x ^ t) & m;
+	//asm("lop3.b32 %0, %1, %2, %3, 0x28;" : "=r"(t) : "r"(x), "r"(t), "r"(m)); //0x28 = (0xF0 ^ 0xCC) & 0xAA 
+	x = (x ^ t);
+	t = t >> 2;
+	y = y ^ t;
+}
+
+__device__ __forceinline__ void SWAP1(uint32_t &x, uint32_t &y, uint32_t m)
+{
+	uint32_t t = (y << 1);
+	t = (x ^ t) & m;
+//	asm("lop3.b32 %0, %1, %2, %3, 0x28;" : "=r"(t) : "r"(x), "r"(t), "r"(m)); //0x28 = (0xF0 ^ 0xCC) & 0xAA 
+	x = (x ^ t);
+	t = t >> 1;
+	y = y ^ t;
+}
 
 #define SWAP4_final(x,y)\
 	asm("and.b32 %0, %0, 0x0f0f0f0f;"\
@@ -28,24 +50,6 @@
 	 "vshl.u32.u32.u32.clamp.add %0, %1, 4, %0;\n\t"\
 	: "+r"(x) : "r"(y));\
 
-
-
-#define SWAP2(x,y)\
-		t = (y<<2); \
-		t = (x ^ t); \
-		t = 0xccccccccUL & t; \
-		x = (x ^ t); \
-		t=  t>>2;\
-		y=  y ^ t;
-
-#define SWAP1(x,y)\
-		t = (y+y); \
-		t = (x ^ t); \
-		t = 0xaaaaaaaaUL & t; \
-		x = (x ^ t); \
-		t=  t>>1;\
-		y=  y ^ t;
-
 __device__ __forceinline__
 void to_bitslice_quad(uint32_t *const __restrict__ input, uint32_t *const __restrict__ output)
 {
@@ -62,6 +66,10 @@ void to_bitslice_quad(uint32_t *const __restrict__ input, uint32_t *const __rest
 			input[i] = __shfl((int)input[i], threadIdx.x & 2, 4);
 			other[i] = __shfl((int)other[i], threadIdx.x & 2, 4);
 		}
+		register uint32_t m1 = 0xaaaaaaaaUL;
+		register uint32_t m2 = 0xccccccccUL;
+		register uint32_t m4 = 0xf0f0f0f0UL;
+
 
 		merge8(output[0], input[0], input[4], perm);
 		merge8(output[1], other[0], other[4], perm);
@@ -72,20 +80,20 @@ void to_bitslice_quad(uint32_t *const __restrict__ input, uint32_t *const __rest
 		merge8(output[6], input[3], input[7], perm);
 		merge8(output[7], other[3], other[7], perm);
 
-		SWAP1(output[0], output[1]);
-		SWAP1(output[2], output[3]);
-		SWAP1(output[4], output[5]);
-		SWAP1(output[6], output[7]);
+		SWAP1(output[0], output[1],m1);
+		SWAP1(output[2], output[3], m1);
+		SWAP1(output[4], output[5], m1);
+		SWAP1(output[6], output[7], m1);
 
-		SWAP2(output[0], output[2]);
-		SWAP2(output[1], output[3]);
-		SWAP2(output[4], output[6]);
-		SWAP2(output[5], output[7]);
+		SWAP2(output[0], output[2], m2);
+		SWAP2(output[1], output[3], m2);
+		SWAP2(output[4], output[6], m2);
+		SWAP2(output[5], output[7], m2);
 
-		SWAP4(output[0], output[4]);
-		SWAP4(output[1], output[5]);
-		SWAP4(output[2], output[6]);
-		SWAP4(output[3], output[7]);
+		SWAP4(output[0], output[4], m4);
+		SWAP4(output[1], output[5], m4);
+		SWAP4(output[2], output[6], m4);
+		SWAP4(output[3], output[7], m4);
 }
 
 __device__ __forceinline__
@@ -94,16 +102,20 @@ void from_bitslice_quad(const uint32_t *const __restrict__ input, uint32_t *cons
 	uint32_t t;
 	const uint32_t perm = 0x7531;//(threadIdx.x & 1) ? 0x3175 : 0x7531;
 
+	register uint32_t m1 = 0xaaaaaaaaUL;
+	register uint32_t m2 = 0xccccccccUL;
+	register uint32_t m4 = 0xf0f0f0f0UL;
+
 		output[0] = __byte_perm(input[0], input[4], perm);
 		output[2] = __byte_perm(input[1], input[5], perm);
 		output[8] = __byte_perm(input[2], input[6], perm);
 		output[10] = __byte_perm(input[3], input[7], perm);
 
-		SWAP1(output[0], output[2]);
-		SWAP1(output[8], output[10]);
+		SWAP1(output[0], output[2], m1);
+		SWAP1(output[8], output[10], m1);
 
-		SWAP2(output[0], output[8]);
-		SWAP2(output[2], output[10]);
+		SWAP2(output[0], output[8], m2);
+		SWAP2(output[2], output[10], m2);
 
 		output[4] = __byte_perm(output[0], output[8], 0x5410);
 		output[8] = __byte_perm(output[0], output[8], 0x7632);
@@ -113,8 +125,8 @@ void from_bitslice_quad(const uint32_t *const __restrict__ input, uint32_t *cons
 		output[10] = __byte_perm(output[2], output[10], 0x7632);
 		output[2] = output[6];
 
-		SWAP4(output[0], output[8]);
-		SWAP4(output[2], output[10]);
+		SWAP4(output[0], output[8], m4);
+		SWAP4(output[2], output[10], m4);
 
 		if (threadIdx.x & 1)
 		{
@@ -162,6 +174,11 @@ void from_bitslice_quad_final(const uint32_t *const __restrict__ input, uint32_t
 	uint32_t t;
 	const uint32_t perm = 0x7531;//(threadIdx.x & 1) ? 0x3175 : 0x7531;
 
+
+	register uint32_t m1 = 0xaaaaaaaaUL;
+	register uint32_t m2 = 0xccccccccUL;
+	register uint32_t m4 = 0xf0f0f0f0UL;
+
 	if (threadIdx.x & 3)
 	{
 
@@ -169,9 +186,9 @@ void from_bitslice_quad_final(const uint32_t *const __restrict__ input, uint32_t
 		output[2] = __byte_perm(input[1], input[5], perm);
 		output[8] = __byte_perm(input[2], input[6], perm);
 		output[10] = __byte_perm(input[3], input[7], perm);
-		SWAP1(output[0], output[2]);
-		SWAP1(output[8], output[10]);
-		SWAP2(output[2], output[10]);
+		SWAP1(output[0], output[2],m1);
+		SWAP1(output[8], output[10], m1);
+		SWAP2(output[2], output[10], m2);
 		output[6] = __byte_perm(output[2], output[10], 0x5410);
 		output[10] = __byte_perm(output[2], output[10], 0x7632);
 		SWAP4_final(output[6], output[10]);
@@ -183,11 +200,11 @@ void from_bitslice_quad_final(const uint32_t *const __restrict__ input, uint32_t
 		output[8] = __byte_perm(input[2], input[6], perm);
 		output[10] = __byte_perm(input[3], input[7], perm);
 
-		SWAP1(output[0], output[2]);
-		SWAP1(output[8], output[10]);
+		SWAP1(output[0], output[2],m1);
+		SWAP1(output[8], output[10], m1);
 
-		SWAP2(output[0], output[8]);
-		SWAP2(output[2], output[10]);
+		SWAP2(output[0], output[8], m2);
+		SWAP2(output[2], output[10], m2);
 
 		output[4] = __byte_perm(output[0], output[8], 0x5410);
 		output[8] = __byte_perm(output[0], output[8], 0x7632);
@@ -197,8 +214,8 @@ void from_bitslice_quad_final(const uint32_t *const __restrict__ input, uint32_t
 		output[10] = __byte_perm(output[2], output[10], 0x7632);
 		output[2] = output[6];
 
-		SWAP4(output[0], output[8]);
-		SWAP4(output[2], output[10]);
+		SWAP4(output[0], output[8], m4);
+		SWAP4(output[2], output[10], m4);
 
 		if (threadIdx.x & 1)
 		{
diff --git a/quark/cuda_quark_groestl512.cu b/quark/cuda_quark_groestl512.cu
index ee5aa8b11..0ca56175c 100644
--- a/quark/cuda_quark_groestl512.cu
+++ b/quark/cuda_quark_groestl512.cu
@@ -16,9 +16,9 @@
 __global__ __launch_bounds__(TPB, 2)
 void quark_groestl512_gpu_hash_64_quad(uint32_t threads, uint32_t startNounce, uint32_t *const __restrict__ g_hash, const uint32_t *const __restrict__ g_nonceVector)
 {
-	uint32_t msgBitsliced[8];
-	uint32_t state[8];
-	uint32_t hash[16];
+	uint32_t __align__(16) msgBitsliced[8];
+	uint32_t __align__(16) state[8];
+	uint32_t __align__(16) hash[16];
 	// durch 4 dividieren, weil jeweils 4 Threads zusammen ein Hash berechnen
     uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x) >> 2;
     if (thread < threads)