Reduce SIMD memory buffers to allow higher intensity

sp-hash · Aug 5, 2018 · f462f8d · f462f8d
1 parent 6e25c8c
commit f462f8d
Show file tree

Hide file tree

Showing 2 changed files with 289 additions and 52 deletions.
diff --git a/x16r/x16r.cu b/x16r/x16r.cu
@@ -44,6 +44,7 @@ extern void x14_shabal512_cpu_hash_64_final_sp(int thr_id, uint32_t threads, uin
 extern void x17_sha512_cpu_hash_64_final(int thr_id, uint32_t threads, uint32_t *d_hash, uint32_t *resNonce, const uint64_t target);
 extern void x11_shavite512_cpu_hash_64_sp_final(int thr_id, uint32_t threads, uint32_t *d_hash, const uint64_t target, uint32_t* resNonce);
 extern void quark_groestl512_cpu_hash_64_final(int thr_id, uint32_t threads, uint32_t *d_hash, uint32_t *d_resNonce, const uint64_t target);
+extern void quark_skein512_cpu_hash_64_final(int thr_id, uint32_t threads, uint32_t *d_hash, uint64_t target, uint32_t *d_resNonce);
 extern void x16_simd_echo512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash);
 extern void x11_cubehash_shavite512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash);
 extern void quark_blake512_cpu_hash_64_final(int thr_id, uint32_t threads, uint32_t *d_nonceVector, uint32_t *d_outputHash, uint32_t *resNonce, const uint64_t target);
@@ -263,11 +264,59 @@ extern "C" int scanhash_x16r(int thr_id, struct work* work, uint32_t max_nonce,
 	uint32_t *ptarget = work->target;
 	const uint32_t first_nonce = pdata[19];
 	const int dev_id = device_map[thr_id];
-	int intensity = (device_sm[dev_id] > 500 ) ? 20 : 18;
+/*	int intensity = (device_sm[dev_id] > 500 ) ? 20 : 18;
 	if (strstr(device_name[dev_id], "GTX 1080")) intensity = 21;
 	if (strstr(device_name[dev_id], "GTX 1060")) intensity = 19;
 	if (strstr(device_name[dev_id], "GTX 1050")) intensity = 18;
 	uint32_t throughput = cuda_default_throughput(thr_id, 1U << intensity);
+*/
+	uint32_t default_throughput=1<<19;
+	bool splitsimd = true;
+	bool merge = false;
+	if ((strstr(device_name[dev_id], "1060")) || (strstr(device_name[dev_id], "P106")))
+	{
+		default_throughput = (1 << 21);
+		splitsimd = false;
+	}
+	else if ((strstr(device_name[dev_id], "970") || (strstr(device_name[dev_id], "980"))))
+	{
+		default_throughput = (1 << 21);
+		splitsimd = false;
+	}
+	else if ((strstr(device_name[dev_id], "1050")))
+	{
+		default_throughput = 1 << 20;
+		splitsimd = false;
+	}
+	else if ((strstr(device_name[dev_id], "950")))
+	{
+		default_throughput = 1 << 20;
+		splitsimd = false;
+	}
+	else if ((strstr(device_name[dev_id], "960")))
+	{
+		default_throughput = 1 << 20;
+		splitsimd = false;
+	}
+	else if ((strstr(device_name[dev_id], "750")))
+	{
+		default_throughput = 1 << 20;
+		splitsimd = false;
+	}
+	else if ((strstr(device_name[dev_id], "1070")) || (strstr(device_name[dev_id], "P104")))
+	{
+		default_throughput = (1 << 24); //53686272; //1 << 20
+		merge = true;
+	}
+	else if ((strstr(device_name[dev_id], "1080 Ti")) || (strstr(device_name[dev_id], "1080")) || (strstr(device_name[dev_id], "P102")))
+	{
+		default_throughput = (1 << 24); //53686272; //1 << 20
+		merge = true;
+	}
+	uint32_t throughput = cuda_default_throughput(thr_id, default_throughput);
+
+	if (throughput > (1<<22)) splitsimd = true;
+
 	if (init[thr_id])
 	{
 		throughput = min(throughput, max_nonce - first_nonce);
@@ -294,7 +343,14 @@ extern "C" int scanhash_x16r(int thr_id, struct work* work, uint32_t max_nonce,
 		quark_jh512_cpu_init(thr_id, throughput);
 		quark_keccak512_cpu_init(thr_id, throughput);
 		x11_shavite512_cpu_init(thr_id, throughput);
-		x11_simd512_cpu_init(thr_id, throughput); // 64
+		if (splitsimd)
+		{
+			x11_simd512_cpu_init(thr_id, (throughput >> 4));
+		}
+		else
+		{
+			x11_simd512_cpu_init(thr_id, (throughput));
+		}
 		x13_hamsi512_cpu_init(thr_id, throughput);
 		x16_echo512_cuda_init(thr_id, throughput);
 		x16_fugue512_cpu_init(thr_id, throughput);
@@ -323,10 +379,10 @@ extern "C" int scanhash_x16r(int thr_id, struct work* work, uint32_t max_nonce,
 		((uint32_t*)ptarget)[7] = 0x003ff;
 //		((uint32_t*)pdata)[1] = 0xFEDCBA98;
 //		((uint32_t*)pdata)[2] = 0x76543210;
-//		((uint32_t*)pdata)[1] = 0x9A9A9A9A;
-//		((uint32_t*)pdata)[2] = 0x9A9A9A9A;
-		((uint32_t*)pdata)[1] = 0x22222222;
-		((uint32_t*)pdata)[2] = 0x22222222;
+		((uint32_t*)pdata)[1] = 0x55555555;
+		((uint32_t*)pdata)[2] = 0x55555555;
+//		((uint32_t*)pdata)[1] = 0x01234567;
+//		((uint32_t*)pdata)[2] = 0x22222222;
 //		((uint32_t*)pdata)[1] = 0x01234567;
 //		((uint32_t*)pdata)[2] = 0x89ABCDEF;
 		//((uint8_t*)pdata)[8] = 0x90; // hashOrder[0] = '9'; for simd 80 + blake512 64
@@ -541,7 +597,17 @@ extern "C" int scanhash_x16r(int thr_id, struct work* work, uint32_t max_nonce,
 				quark_keccak512_cpu_hash_64(thr_id, throughput, NULL, d_hash[thr_id]); order++;
 				break;
 			case SKEIN:
-				quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
+				if (i == 15)
+				{
+					quark_skein512_cpu_hash_64_final(thr_id, throughput, d_hash[thr_id], ((uint64_t *)ptarget)[3], d_resNonce[thr_id]);
+					CUDA_SAFE_CALL(cudaMemcpy(h_resNonce[thr_id], d_resNonce[thr_id], 2 * sizeof(uint32_t), cudaMemcpyDeviceToHost));
+					work->nonces[0] = h_resNonce[thr_id][0];
+					addstart = true;
+				}
+				else
+				{
+					quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
+				}
 				break;
 			case LUFFA:
 				if (i == 15)
@@ -581,29 +647,74 @@ extern "C" int scanhash_x16r(int thr_id, struct work* work, uint32_t max_nonce,
 				}
 				break;
 			case SIMD:
-				if (nextalgo == ECHO)
-				{
-					x16_simd_echo512_cpu_hash_64(thr_id, throughput, d_hash[thr_id] );
-					i = i + 1;
-				}
-				else if (nextalgo == WHIRLPOOL)
-				{
-					x16_simd_whirlpool512_cpu_hash_64(thr_id, throughput, d_hash[thr_id]);
-					i = i + 1;
-				}
-				else if (nextalgo == HAMSI)
-				{
-					x16_simd_hamsi512_cpu_hash_64(thr_id, throughput, d_hash[thr_id]);
-					i = i + 1;
-				}
-				else if (nextalgo == FUGUE)
+				if (!splitsimd)
 				{
-					x16_simd_fugue512_cpu_hash_64(thr_id, throughput, d_hash[thr_id]);
-					i = i + 1;
+					if (nextalgo == ECHO)
+					{
+						x16_simd_echo512_cpu_hash_64(thr_id, throughput, d_hash[thr_id]);
+						i = i + 1;
+					}
+					else if (nextalgo == WHIRLPOOL)
+					{
+						x16_simd_whirlpool512_cpu_hash_64(thr_id, throughput, d_hash[thr_id]);
+						i = i + 1;
+					}
+					else if (nextalgo == HAMSI)
+					{
+						x16_simd_hamsi512_cpu_hash_64(thr_id, throughput, d_hash[thr_id]);
+						i = i + 1;
+					}
+					else if (nextalgo == FUGUE)
+					{
+						x16_simd_fugue512_cpu_hash_64(thr_id, throughput, d_hash[thr_id]);
+						i = i + 1;
+					}
+					else
+					{
+						x11_simd512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order);
+					}
 				}
 				else
 				{
-					x11_simd512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id],order);
+					if (nextalgo == ECHO)
+					{
+						for (int j = 0; j < 256; j += 16)
+						{
+							x16_simd_echo512_cpu_hash_64(thr_id, throughput >> 4, d_hash[thr_id] + (((throughput / 4)*j) / (sizeof(int))));
+						}
+						i = i + 1;
+					}
+					else if (nextalgo == WHIRLPOOL)
+					{
+						for (int j = 0; j < 256; j += 16)
+						{
+							x16_simd_whirlpool512_cpu_hash_64(thr_id, throughput >> 4, d_hash[thr_id] + (((throughput / 4)*j) / (sizeof(int))));
+						}
+						i = i + 1;
+					}
+					else if (nextalgo == HAMSI)
+					{
+						for (int j = 0; j < 256; j += 16)
+						{
+							x16_simd_hamsi512_cpu_hash_64(thr_id, throughput >> 4, d_hash[thr_id] + (((throughput / 4)*j) / (sizeof(int))));
+						}
+						i = i + 1;
+					}
+					else if (nextalgo == FUGUE)
+					{
+						for (int j = 0; j < 256; j += 16)
+						{
+							x16_simd_fugue512_cpu_hash_64(thr_id, throughput >> 4, d_hash[thr_id] + (((throughput / 4)*j) / (sizeof(int))));
+						}
+						i = i + 1;
+					}
+					else
+					{
+						for (int j = 0; j < 256; j += 16)
+						{
+							x11_simd512_cpu_hash_64(thr_id, throughput >> 4, pdata[19], NULL, d_hash[thr_id] + (((throughput / 4)*j) / (sizeof(int))),order);
+						}
+					}
 				}
 				break;
 			case ECHO: