From klaus_T: faster lyra2 (+60khash gtx 970)

sp-hash · Jun 11, 2015 · 2b2a3bd · 2b2a3bd
1 parent 237ec7d
commit 2b2a3bd
Show file tree

Hide file tree

Showing 3 changed files with 108 additions and 74 deletions.
diff --git a/Algo256/cuda_skein256.cu b/Algo256/cuda_skein256.cu
@@ -5,86 +5,103 @@
 __forceinline__ __device__
 void Round512v35(uint2 &p0, uint2 &p1, uint2 &p2, uint2 &p3, uint2 &p4, uint2 &p5, uint2 &p6, uint2 &p7, const int ROT1, const int ROT2, const int ROT3, const int ROT4)
 {
-	p0 += p1; p1 = ROL2(p1, ROT1);  p1 ^= p0;
-	p2 += p3; p3 = ROL2(p3, ROT2);  p3 ^= p2;
-	p4 += p5; p5 = ROL2(p5, ROT3);  p5 ^= p4;
-	p6 += p7; p7 = ROL2(p7, ROT4);  p7 ^= p6;
+	p0 += p1; p1 = ROL2(p1, ROT1) ^ p0;
+	p2 += p3; p3 = ROL2(p3, ROT2) ^ p2;
+	p4 += p5; p5 = ROL2(p5, ROT3) ^ p4;
+	p6 += p7; p7 = ROL2(p7, ROT4) ^ p6;
 }
 
 __forceinline__ __device__
-void Round_8_512v35(uint2 *ks, uint2 *ts,
+void Round_8_512v35(const uint2 *const __restrict__ ks, const uint2 *const __restrict__ ts,
                     uint2 &p0, uint2 &p1, uint2 &p2, uint2 &p3,
                     uint2 &p4, uint2 &p5, uint2 &p6, uint2 &p7, int R)
 {
 	Round512v35(p0, p1, p2, p3, p4, p5, p6, p7, 46, 36, 19, 37);
 	Round512v35(p2, p1, p4, p7, p6, p5, p0, p3, 33, 27, 14, 42);
 	Round512v35(p4, p1, p6, p3, p0, p5, p2, p7, 17, 49, 36, 39);
 	Round512v35(p6, p1, p0, p7, p2, p5, p4, p3, 44, 9, 54, 56);
-	p0 += ks[((R)+0) % 9];   /* inject the key schedule value */
-	p1 += ks[((R)+1) % 9];
-	p2 += ks[((R)+2) % 9];
-	p3 += ks[((R)+3) % 9];
-	p4 += ks[((R)+4) % 9];
-	p5 += ks[((R)+5) % 9] + ts[((R)+0) % 3];
-	p6 += ks[((R)+6) % 9] + ts[((R)+1) % 3];
-	p7 += ks[((R)+7) % 9] + make_uint2((R),0);
+	p0 += ks[(R+0) % 9];   /* inject the key schedule value */
+	p1 += ks[(R+1) % 9];
+	p2 += ks[(R+2) % 9];
+	p3 += ks[(R+3) % 9];
+	p4 += ks[(R+4) % 9];
+	p5 += ks[(R+5) % 9] + ts[(R+0) % 3];
+	p6 += ks[(R+6) % 9] + ts[(R+1) % 3];
+	p7 += ks[(R+7) % 9] + make_uint2(R,0);
 	Round512v35(p0, p1, p2, p3, p4, p5, p6, p7, 39, 30, 34, 24);
 	Round512v35(p2, p1, p4, p7, p6, p5, p0, p3, 13, 50, 10, 17);
 	Round512v35(p4, p1, p6, p3, p0, p5, p2, p7, 25, 29, 39, 43);
 	Round512v35(p6, p1, p0, p7, p2, p5, p4, p3, 8, 35, 56, 22);
-	p0 += ks[((R)+1) % 9];   /* inject the key schedule value */
-	p1 += ks[((R)+2) % 9];
-	p2 += ks[((R)+3) % 9];
-	p3 += ks[((R)+4) % 9];
-	p4 += ks[((R)+5) % 9];
-	p5 += ks[((R)+6) % 9] + ts[((R)+1) % 3];
-	p6 += ks[((R)+7) % 9] + ts[((R)+2) % 3];
-	p7 += ks[((R)+8) % 9] + make_uint2((R)+1, 0);
+	p0 += ks[(R+1) % 9];   /* inject the key schedule value */
+	p1 += ks[(R+2) % 9];
+	p2 += ks[(R+3) % 9];
+	p3 += ks[(R+4) % 9];
+	p4 += ks[(R+5) % 9];
+	p5 += ks[(R+6) % 9] + ts[(R+1) % 3];
+	p6 += ks[(R+7) % 9] + ts[(R+2) % 3];
+	p7 += ks[(R+8) % 9] + make_uint2(R+1, 0);
 }
 
+__forceinline__ __device__
+void Round_8_512v35_final(const uint2 *const __restrict__ ks, const uint2 *const __restrict__ ts,
+uint2 &p0, uint2 &p1, uint2 &p2, uint2 &p3,
+uint2 &p4, uint2 &p5, uint2 &p6, uint2 &p7)
+{
+	Round512v35(p0, p1, p2, p3, p4, p5, p6, p7, 46, 36, 19, 37);
+	Round512v35(p2, p1, p4, p7, p6, p5, p0, p3, 33, 27, 14, 42);
+	Round512v35(p4, p1, p6, p3, p0, p5, p2, p7, 17, 49, 36, 39);
+	Round512v35(p6, p1, p0, p7, p2, p5, p4, p3, 44, 9, 54, 56);
+	p0 += ks[8];   /* inject the key schedule value */
+	p1 += ks[0];
+	p2 += ks[1];
+	p3 += ks[2];
+	p4 += ks[3];
+	p5 += ks[4] + ts[2];
+	p6 += ks[5] + ts[0];
+	p7 += ks[6] + make_uint2(17, 0);
+	Round512v35(p0, p1, p2, p3, p4, p5, p6, p7, 39, 30, 34, 24);
+	Round512v35(p2, p1, p4, p7, p6, p5, p0, p3, 13, 50, 10, 17);
+	Round512v35(p4, p1, p6, p3, p0, p5, p2, p7, 25, 29, 39, 43);
+	Round512v35(p6, p1, p0, p7, p2, p5, p4, p3, 8, 35, 56, 22);
+	p0 += ks[0];   /* inject the key schedule value */
+	p1 += ks[1];
+	p2 += ks[2];
+	p3 += ks[3];
+}
 
 __global__ __launch_bounds__(256,3)
 void skein256_gpu_hash_32(uint32_t threads, uint32_t startNounce, uint64_t *outputHash)
 {
-	const	uint2 vSKEIN_IV512_256[8] = {
-			{ 0x2FDB3E13, 0xCCD044A1 },
-			{ 0x1A79A9EB, 0xE8359030 },
-			{ 0x4F816E6F, 0x55AEA061 },
-			{ 0xAE9B94DB, 0x2A2767A4 },
-			{ 0x74DD7683, 0xEC06025E },
-			{ 0xC4746251, 0xE7A436CD },
-			{ 0x393AD185, 0xC36FBAF9 },
-			{ 0x33EDFC13, 0x3EEDBA18 }
-	};
-	const uint2 skein_ks_parity = { 0xA9FC1A22, 0x1BD11BDA };
-	const uint2 t12[6] = {
+	const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
+
+	if (thread < threads)
+	{
+		const uint2 skein_ks_parity = { 0xA9FC1A22, 0x1BD11BDA };
+		const uint2 t12[6] =
+		{
 			{ 0x20, 0 },
 			{ 0, 0xf0000000 },
 			{ 0x20, 0xf0000000 },
 			{ 0x08, 0 },
 			{ 0, 0xff000000 },
 			{ 0x08, 0xff000000 }
-	};
-
+		};
 
-	uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
-	if (thread < threads)
-	{
-		uint2 h[9];
-		uint2 t[3];
+		uint2 h[9] =
+		{
+			{ 0x2FDB3E13, 0xCCD044A1 },
+			{ 0x1A79A9EB, 0xE8359030 },
+			{ 0x4F816E6F, 0x55AEA061 },
+			{ 0xAE9B94DB, 0x2A2767A4 },
+			{ 0x74DD7683, 0xEC06025E },
+			{ 0xC4746251, 0xE7A436CD },
+			{ 0x393AD185, 0xC36FBAF9 },
+			{ 0x33EDFC13, 0x3EEDBA18 },
+			{ 0xC73A4E2A, 0xB69D3CFC }
+		};
 		uint2 dt0,dt1,dt2,dt3;
 		uint2 p0, p1, p2, p3, p4, p5, p6, p7;
 
-		h[8] = skein_ks_parity;
-		for (int i = 0; i<8; i++) {
-			h[i] = vSKEIN_IV512_256[i];
-			h[8] ^= h[i];
-		}
-
-		t[0]=t12[0];
-		t[1]=t12[1];
-		t[2]=t12[2];
-
 		LOHI(dt0.x,dt0.y,outputHash[thread]);
 		LOHI(dt1.x,dt1.y,outputHash[threads+thread]);
 		LOHI(dt2.x,dt2.y,outputHash[2*threads+thread]);
@@ -95,13 +112,14 @@ void skein256_gpu_hash_32(uint32_t threads, uint32_t startNounce, uint64_t *outp
 		p2 = h[2] + dt2;
 		p3 = h[3] + dt3;
 		p4 = h[4];
-		p5 = h[5] + t[0];
-		p6 = h[6] + t[1];
+		p5 = h[5] + t12[0];
+		p6 = h[6] + t12[1];
 		p7 = h[7];
 
 		#pragma unroll
-		for (int i = 1; i<19; i+=2) {
-			Round_8_512v35(h,t,p0,p1,p2,p3,p4,p5,p6,p7,i);
+		for (int i = 1; i<19; i+=2)
+		{
+			Round_8_512v35(h,t12,p0,p1,p2,p3,p4,p5,p6,p7,i);
 		}
 
 		p0 ^= dt0;
@@ -117,24 +135,18 @@ void skein256_gpu_hash_32(uint32_t threads, uint32_t startNounce, uint64_t *outp
 		h[5] = p5;
 		h[6] = p6;
 		h[7] = p7;
-		h[8] = skein_ks_parity;
-
-		#pragma unroll 8
-		for (int i = 0; i<8; i++) {
-			h[8] ^= h[i];
-		}
+		h[8] = skein_ks_parity ^ h[0] ^ h[1] ^ h[2] ^ h[3] ^ h[4] ^ h[5] ^ h[6] ^ h[7];
 
-		t[0] = t12[3];
-		t[1] = t12[4];
-		t[2] = t12[5];
-		p5 += t[0];  //p5 already equal h[5]
-		p6 += t[1];
+		const uint2 *t = t12+3;
+		p5 += t12[3];  //p5 already equal h[5]
+		p6 += t12[4];
 
 		#pragma unroll
-		for (int i = 1; i<19; i+=2) {
+		for (int i = 1; i<17; i+=2)
+		{
 			Round_8_512v35(h, t, p0, p1, p2, p3, p4, p5, p6, p7, i);
 		}
-
+		Round_8_512v35_final(h, t, p0, p1, p2, p3, p4, p5, p6, p7);
 		outputHash[thread]           = devectorize(p0);
 		outputHash[threads+thread]   = devectorize(p1);
 		outputHash[2*threads+thread] = devectorize(p2);
@@ -156,7 +168,7 @@ void skein256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, ui
 	dim3 grid((threads + threadsperblock - 1) / threadsperblock);
 	dim3 block(threadsperblock);
 
-	skein256_gpu_hash_32<<<grid, block>>>(threads, startNounce, d_outputHash);
+	skein256_gpu_hash_32<<<grid, block, 0, gpustream[thr_id]>>>(threads, startNounce, d_outputHash);
 
 }
 
diff --git a/lyra2/cuda_lyra2.cu b/lyra2/cuda_lyra2.cu
@@ -28,10 +28,13 @@ static __device__ __forceinline__ void round_lyra_v35(uint2 *s)
 
 __device__ __forceinline__ void reduceDuplexRowSetup(const int rowIn, const int rowInOut, const int rowOut, uint2 state[16], uint2 Matrix[96][8])
 { 
-
-	for (int i = 0; i < 8*12; i+=12)
+#if __CUDA_ARCH__ > 500
+	#pragma unroll
+	for (int i = 0; i < 8 * 12; i += 12)
+#else
+	for (int i = 0; i < 8 * 12; i += 12)
+#endif	
 	{
-		#pragma unroll
 		for (int j = 0; j < 12; j++)
 			state[j] ^= Matrix[i + j][rowIn] + Matrix[i + j][rowInOut];
 		round_lyra_v35(state);

diff --git a/lyra2/lyra2RE.cu b/lyra2/lyra2RE.cu
@@ -64,6 +64,28 @@ extern "C" int scanhash_lyra2(int thr_id, uint32_t *pdata,
 {
 	const uint32_t first_nonce = pdata[19];
 	unsigned int intensity = (device_sm[device_map[thr_id]] > 500) ? 256 * 256 * 25 : 256 * 256 * 19;
+	cudaDeviceProp props;
+	cudaGetDeviceProperties(&props, device_map[thr_id]);
+	if (strstr(props.name, "970"))
+	{
+		intensity = 256 * 256 * 26;
+	}
+	else if (strstr(props.name, "980"))
+	{
+		intensity = 256 * 256 * 26;
+	}
+	else if (strstr(props.name, "750 Ti"))
+	{
+		intensity = 256 * 256 * 19;
+	}
+	else if (strstr(props.name, "750"))
+	{
+		intensity = 256 * 256 * 19;
+	}
+	else if (strstr(props.name, "960"))
+	{
+		intensity = 256 * 256 * 20;
+	}
 	uint32_t throughput = device_intensity(device_map[thr_id], __func__, intensity); // 18=256*256*4;
 	throughput = min(throughput, (max_nonce - first_nonce));
 
@@ -103,7 +125,6 @@ extern "C" int scanhash_lyra2(int thr_id, uint32_t *pdata,
 		keccak256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id]);
 		lyra2_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id]);
 		skein256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id]);
-//		MyStreamSynchronize(NULL, 2, thr_id);
 		groestl256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], foundNonce);
 		if (foundNonce[0] != 0)
 		{
@@ -126,7 +147,6 @@ extern "C" int scanhash_lyra2(int thr_id, uint32_t *pdata,
 				}
 				pdata[19] = foundNonce[0];
 				if (opt_benchmark) applog(LOG_INFO, "GPU #%d Found nounce % 08x", thr_id, foundNonce[0], vhash64[7], Htarg);
-				MyStreamSynchronize(NULL, NULL, device_map[thr_id]);
 				return res;
 			}
 			else
@@ -140,6 +160,5 @@ extern "C" int scanhash_lyra2(int thr_id, uint32_t *pdata,
 	} while (!work_restart[thr_id].restart && ((uint64_t)max_nonce > ((uint64_t)(pdata[19]) + (uint64_t)throughput)));
 
 	*hashes_done = pdata[19] - first_nonce + 1;
-	MyStreamSynchronize(NULL, NULL, device_map[thr_id]);
 	return 0;
 }