Better default intensities for quark on the 980 Ti

sp-hash · Apr 2, 2016 · 3a726d9 · 3a726d9
1 parent 6848402
commit 3a726d9
Show file tree

Hide file tree

Showing 13 changed files with 1,172 additions and 175 deletions.
diff --git a/Algo256/decred_ok.cu b/Algo256/decred_ok.cu
diff --git a/bitslice_transformations_quad.cu b/bitslice_transformations_quad.cu
@@ -176,8 +176,7 @@ void from_bitslice_quad_final(const uint32_t *const __restrict__ input, uint32_t
 		output[10] = __byte_perm(output[2], output[10], 0x7632);
 		SWAP4_final(output[6], output[10]);
 		output[6] = __byte_perm(output[6], 0, 0x3232);
-	}
-	else
+	} else
 	{
 		output[0] = __byte_perm(input[0], input[4], perm);
 		output[2] = __byte_perm(input[1], input[5], perm);
@@ -212,8 +211,7 @@ void from_bitslice_quad_final(const uint32_t *const __restrict__ input, uint32_t
 			output[2] = __byte_perm(output[2], 0, 0x1032);
 			output[8] = __byte_perm(output[8], 0, 0x1032);
 			output[10] = __byte_perm(output[10], 0, 0x1032);
-		}
-		else
+		}else
 		{
 			output[4] = output[0];
 			output[6] = output[2];

diff --git a/ccminer.cpp b/ccminer.cpp
@@ -2705,6 +2705,8 @@ int main(int argc, char *argv[])
 	opt_syslog_pfx = strdup(PROGRAM_NAME);
 	opt_api_allow = strdup("127.0.0.1"); /* 0.0.0.0 for all ips */
 
+	printf("\t\t\tSP-Mod Private #6 \n");
+
 #ifdef _MSC_VER
 	printf("Compiled with Visual C++ %d ", _MSC_VER / 100);
 #else

diff --git a/ccminer.v12.suo b/ccminer.v12.suo
diff --git a/keccak/cuda_keccak256.cu b/keccak/cuda_keccak256.cu
diff --git a/keccak/keccak256.cu b/keccak/keccak256.cu
@@ -14,11 +14,11 @@ extern "C"
 
 #include "cuda_helper.h"
 
-static uint32_t *d_hash[8];
+static uint32_t h_nounce[MAX_GPUS][2];
 
-extern void keccak256_cpu_init(int thr_id, int threads);
-extern void keccak256_setBlock_80(void *pdata,const void *ptarget);
-extern uint32_t keccak256_cpu_hash_80(int thr_id, int threads, uint32_t startNounce, uint32_t *d_hash, int order);
+extern void keccak256_cpu_init(int thr_id, uint32_t threads);
+extern void keccak256_setBlock_80(void *pdata,const uint64_t *ptarget);
+extern void keccak256_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *h_nounce);
 
 // CPU Hash
 extern "C" void keccak256_hash(void *state, const void *input)
@@ -34,25 +34,29 @@ extern "C" void keccak256_hash(void *state, const void *input)
 	memcpy(state, hash, 32);
 }
 
+static bool init[MAX_GPUS] = { 0 };
+
 extern "C" int scanhash_keccak256(int thr_id, uint32_t *pdata,
 	const uint32_t *ptarget, uint32_t max_nonce,
 	unsigned long *hashes_done)
 {
 	const uint32_t first_nonce = pdata[19];
+	uint32_t intensity = (device_sm[device_map[thr_id]] > 500) ? 1 << 28 : 1 << 27;;
+	uint32_t throughput = device_intensity(device_map[thr_id], __func__, intensity); // 256*4096
+	throughput = min(throughput, max_nonce - first_nonce);
 
-	if (opt_benchmark)
-		((uint32_t*)ptarget)[7] = 0x0005;
 
-	uint32_t throughput = opt_work_size ? opt_work_size : (1 << 21); // 256*256*8*4
-	throughput = min(throughput, (int)(max_nonce - first_nonce));
+	if (opt_benchmark)
+		((uint32_t*)ptarget)[7] = 0x01;
 
-	static bool init[8] = {0,0,0,0,0,0,0,0};
 	if (!init[thr_id]) {
 		cudaSetDevice(device_map[thr_id]);
-
-		CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], 16 * sizeof(uint32_t) * throughput));
-		keccak256_cpu_init(thr_id, throughput);
-
+		if (!opt_cpumining) cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
+		if (opt_n_gputhreads == 1)
+		{
+			cudaDeviceSetCacheConfig(cudaFuncCachePreferL1);
+		}
+		keccak256_cpu_init(thr_id, (int)throughput);
 		init[thr_id] = true;
 	}
 
@@ -61,38 +65,46 @@ extern "C" int scanhash_keccak256(int thr_id, uint32_t *pdata,
 		be32enc(&endiandata[k], ((uint32_t*)pdata)[k]);
 	}
 
-	keccak256_setBlock_80((void*)endiandata, ptarget);
+	keccak256_setBlock_80((void*)endiandata, (uint64_t *)ptarget);
+
 	do {
-		int order = 0;
 
-		uint32_t foundNonce = keccak256_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
-		if  (foundNonce != 0xffffffff)
+		keccak256_cpu_hash_80(thr_id, (int) throughput, pdata[19], h_nounce[thr_id]);
+		if (h_nounce[thr_id][0] != UINT32_MAX)
 		{
-			uint32_t vhash64[8];
 			uint32_t Htarg = ptarget[7];
-			be32enc(&endiandata[19], foundNonce);
+			uint32_t vhash64[8];
+			be32enc(&endiandata[19], h_nounce[thr_id][0]);
 			keccak256_hash(vhash64, endiandata);
 
-			if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) {
-
-				*hashes_done = pdata[19] + throughput - first_nonce;
-				pdata[19] = foundNonce;
-				return 1;
-
-			} else {
-				applog(LOG_DEBUG, "GPU #%d: result for nounce %08x does not validate on CPU!", thr_id, foundNonce);
+			if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget))
+			{
+				int res = 1;
+				// check if there was some other ones...
+				*hashes_done = pdata[19] - first_nonce + throughput;
+				if (h_nounce[thr_id][1] != 0xffffffff)
+				{
+					pdata[21] = h_nounce[thr_id][1];
+					res++;
+					if (opt_benchmark)
+						applog(LOG_INFO, "GPU #%d Found second nounce %08x", thr_id, h_nounce[thr_id][1], vhash64[7], Htarg);
+				}
+				pdata[19] = h_nounce[thr_id][0];
+				if (opt_benchmark)
+					applog(LOG_INFO, "GPU #%d Found nounce %08x", thr_id, h_nounce[thr_id][0], vhash64[7], Htarg);
+				return res;
+			}
+			else
+			{
+				if (vhash64[7] != Htarg)
+				{
+					applog(LOG_INFO, "GPU #%d: result for %08x does not validate on CPU!", thr_id, h_nounce[thr_id][0]);
+				}
 			}
-		}
-
-		if ((uint64_t) pdata[19] + throughput > (uint64_t) max_nonce) {
-			pdata[19] = max_nonce;
-			break;
 		}
 
 		pdata[19] += throughput;
-
-	} while (!scan_abort_flag && !work_restart[thr_id].restart);
-
-	*hashes_done = pdata[19] - first_nonce + 1;
+	} while (!scan_abort_flag && !work_restart[thr_id].restart && ((uint64_t)max_nonce > ((uint64_t)(pdata[19]) + (uint64_t)throughput)));
+	*hashes_done = pdata[19] - first_nonce;
 	return 0;
 }
diff --git a/lyra2/cuda_lyra2v2.cu b/lyra2/cuda_lyra2v2.cu
@@ -187,7 +187,6 @@ __device__ void reduceDuplexRowtV2(const int rowIn, const int rowInOut, const in
 		#pragma unroll 
 		for (j = 0; j < 11; j++)
 		((uint2*)state2)[j + 1] ^= ((uint2*)state)[j];
-
 #if __CUDA_ARCH__ == 500
 		if (rowInOut != rowOut) 
 		{

diff --git a/lyra2/lyra2REv2.cu b/lyra2/lyra2REv2.cu
@@ -86,7 +86,7 @@ extern "C" int scanhash_lyra2v2(int thr_id, uint32_t *pdata,
 	const uint32_t first_nonce = pdata[19];
 	uint32_t intensity = 256 * 256 * 8;
 	uint32_t tpb = 8;
-	bool mergeblakekeccak = false;
+//	bool mergeblakekeccak = false;
 	cudaDeviceProp props;
 	cudaGetDeviceProperties(&props, device_map[thr_id]);
 	if (strstr(props.name, "970"))
@@ -108,13 +108,13 @@ extern "C" int scanhash_lyra2v2(int thr_id, uint32_t *pdata,
 	{
 		intensity = 256 * 256 * 10;
 		tpb = 16;
-		mergeblakekeccak = true;
+//		mergeblakekeccak = true;
 	}
 	else if (strstr(props.name, "750"))
 	{
 		intensity = 256 * 256 * 5;
 		tpb = 16;
-		mergeblakekeccak = true;
+//		mergeblakekeccak = true;
 	}
 	else if (strstr(props.name, "960"))
 	{
@@ -158,16 +158,17 @@ extern "C" int scanhash_lyra2v2(int thr_id, uint32_t *pdata,
 	do {
 		uint32_t foundNonce[2] = { 0, 0 };
 
-		if (mergeblakekeccak)
-		{
+//		if (mergeblakekeccak)
+//		{
 			blakeKeccak256_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]);
-		}
+
+/*		}
 		else
 		{
 			blake256_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]);
 			keccak256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id]);
 		}
-
+*/
 		cubehash256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id]);
 		lyra2v2_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], tpb);
 		skein256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id]);

diff --git a/quark/quarkcoin.cu b/quark/quarkcoin.cu
@@ -145,8 +145,15 @@ extern "C" int scanhash_quark(int thr_id, uint32_t *pdata,
 
 	uint32_t intensity = 256*256*57;
 	intensity = intensity + ((1 << 22));
-	if (device_sm[device_map[thr_id]] > 500) intensity= 1 << 24;
+	cudaDeviceProp props;
+	cudaGetDeviceProperties(&props, device_map[thr_id]);
 
+	if (device_sm[device_map[thr_id]] > 500) intensity= 1 << 25;
+
+	if (strstr(props.name, "980 Ti"))
+	{
+		intensity = 1 << 25;
+	}
 	uint32_t throughput = device_intensity(device_map[thr_id], __func__, intensity); // 256*4096
 	throughput = min(throughput, max_nonce - first_nonce);
 

diff --git a/x13/x13.cu b/x13/x13.cu
@@ -44,7 +44,7 @@ extern void cuda_jh512Keccak512_cpu_hash_64( uint32_t threads, uint32_t startNou
 
 
 
-extern void x11_luffaCubehash512_cpu_hash_64(uint32_t threads, uint32_t startNounce, uint32_t *d_hash, uint32_t luffacubehashthreads);
+extern void x11_luffaCubehash512_cpu_hash_64(uint32_t threads, uint32_t startNounce, uint32_t *d_hash);
 
 extern void x11_shavite512_cpu_hash_64(uint32_t threads, uint32_t startNounce, uint32_t *d_hash, uint32_t shavitethreads);
 
@@ -160,7 +160,6 @@ extern "C" int scanhash_x13(int thr_id, uint32_t *pdata,
 	uint32_t throughput = device_intensity(device_map[thr_id], __func__, intensity);
 	uint32_t simdthreads = (device_sm[device_map[thr_id]] > 500) ? 64 : 32;
 	uint32_t shavitethreads = (device_sm[device_map[thr_id]] == 500) ? 256 : 320;
-	uint32_t luffacubehashthreads = (device_sm[device_map[thr_id]] == 500) ? 512 : 256;
 
 	throughput = min(throughput, (max_nonce - first_nonce));
 
@@ -209,7 +208,7 @@ extern "C" int scanhash_x13(int thr_id, uint32_t *pdata,
 		quark_groestl512_cpu_hash_64(throughput, pdata[19], NULL, d_hash[thr_id]);
 		quark_skein512_cpu_hash_64(throughput, pdata[19], NULL, d_hash[thr_id]);
 		cuda_jh512Keccak512_cpu_hash_64(throughput, pdata[19], d_hash[thr_id]);
-		x11_luffaCubehash512_cpu_hash_64(throughput, pdata[19], d_hash[thr_id], luffacubehashthreads);
+		x11_luffaCubehash512_cpu_hash_64(throughput, pdata[19], d_hash[thr_id]);
 		x11_shavite512_cpu_hash_64(throughput, pdata[19], d_hash[thr_id], shavitethreads);
 		x11_simd512_cpu_hash_64(thr_id, throughput, pdata[19], d_hash[thr_id],simdthreads);
 		x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], d_hash[thr_id]);

diff --git a/x15/x14.cu b/x15/x14.cu
@@ -43,7 +43,7 @@ extern void quark_skein512_cpu_hash_64(uint32_t threads, uint32_t startNounce, u
 
 extern void cuda_jh512Keccak512_cpu_hash_64(uint32_t threads, uint32_t startNounce, uint32_t *d_hash);
 
-extern void x11_luffaCubehash512_cpu_hash_64( uint32_t threads, uint32_t startNounce, uint32_t *d_hash,uint32_t luffacubehashthreads);
+extern void x11_luffaCubehash512_cpu_hash_64( uint32_t threads, uint32_t startNounce, uint32_t *d_hash);
 
 extern void x11_shavite512_cpu_hash_64(uint32_t threads, uint32_t startNounce, uint32_t *d_hash, uint32_t shavitethreads);
 
@@ -159,7 +159,6 @@ extern "C" int scanhash_x14(int thr_id, uint32_t *pdata,
 	int intensity = (device_sm[device_map[thr_id]] > 500) ? 256 * 256 * 20 : 256 * 256 * 10;
 	uint32_t simdthreads = (device_sm[device_map[thr_id]] > 500) ? 64 : 32;
 	uint32_t shavitethreads = (device_sm[device_map[thr_id]] == 500) ? 256 : 320;
-	uint32_t luffacubehashthreads = (device_sm[device_map[thr_id]] == 500) ? 512 : 256;
 
 	uint32_t throughput = device_intensity(device_map[thr_id], __func__, intensity); // 19=256*256*8;
 	throughput = min(throughput, max_nonce - first_nonce);
@@ -209,7 +208,7 @@ extern "C" int scanhash_x14(int thr_id, uint32_t *pdata,
 		quark_groestl512_cpu_hash_64(throughput, pdata[19], NULL, d_hash[thr_id]);
 		quark_skein512_cpu_hash_64( throughput, pdata[19], NULL, d_hash[thr_id]);
 		cuda_jh512Keccak512_cpu_hash_64(throughput, pdata[19], d_hash[thr_id]);
-		x11_luffaCubehash512_cpu_hash_64(throughput, pdata[19], d_hash[thr_id], luffacubehashthreads);
+		x11_luffaCubehash512_cpu_hash_64(throughput, pdata[19], d_hash[thr_id]);
 		x11_shavite512_cpu_hash_64(throughput, pdata[19], d_hash[thr_id], shavitethreads);
 		x11_simd512_cpu_hash_64(thr_id, throughput, pdata[19], d_hash[thr_id],simdthreads);
 		x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], d_hash[thr_id]);

diff --git a/x15/x15.cu b/x15/x15.cu
@@ -47,7 +47,7 @@ extern void quark_skein512_cpu_hash_64(uint32_t threads, uint32_t startNounce, u
 //extern void quark_jh512_cpu_hash_64( uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash);
 extern void cuda_jh512Keccak512_cpu_hash_64(uint32_t threads, uint32_t startNounce, uint32_t *d_hash);
 
-extern void x11_luffaCubehash512_cpu_hash_64(uint32_t threads, uint32_t startNounce, uint32_t *d_hash, uint32_t luffacubehashthreads);
+extern void x11_luffaCubehash512_cpu_hash_64(uint32_t threads, uint32_t startNounce, uint32_t *d_hash);
 extern void x11_shavite512_cpu_hash_64(uint32_t threads, uint32_t startNounce, uint32_t *d_hash, uint32_t shavitethreads);
 
 extern int  x11_simd512_cpu_init(int thr_id, uint32_t threads);
@@ -173,10 +173,9 @@ extern "C" int scanhash_x15(int thr_id, uint32_t *pdata,
 	throughput = min(throughput, (max_nonce - first_nonce));
 	uint32_t simdthreads = (device_sm[device_map[thr_id]] > 500) ? 64 : 32;
 	uint32_t shavitethreads = (device_sm[device_map[thr_id]] == 500) ? 256 : 320;
-	uint32_t luffacubehashthreads = (device_sm[device_map[thr_id]] == 500) ? 512 : 256;
 
 	if (opt_benchmark)
-		((uint32_t*)ptarget)[7] = 0xff;
+		((uint32_t*)ptarget)[7] = 0xf;
 
 	if (!init[thr_id])
 	{
@@ -223,7 +222,7 @@ extern "C" int scanhash_x15(int thr_id, uint32_t *pdata,
 		quark_skein512_cpu_hash_64(throughput, pdata[19], NULL, d_hash[thr_id]);
 		cuda_jh512Keccak512_cpu_hash_64(throughput, pdata[19], d_hash[thr_id]);
 
-		x11_luffaCubehash512_cpu_hash_64(throughput, pdata[19], d_hash[thr_id], luffacubehashthreads);
+		x11_luffaCubehash512_cpu_hash_64(throughput, pdata[19], d_hash[thr_id]);
 		x11_shavite512_cpu_hash_64(throughput, pdata[19], d_hash[thr_id],shavitethreads);
 		x11_simd512_cpu_hash_64(thr_id, throughput, pdata[19], d_hash[thr_id],simdthreads);
 		x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19],  d_hash[thr_id]);

diff --git a/x17/x17.cu b/x17/x17.cu
@@ -47,7 +47,7 @@ extern void quark_skein512_cpu_hash_64(uint32_t threads, uint32_t startNounce, u
 
 extern void cuda_jh512Keccak512_cpu_hash_64(uint32_t threads, uint32_t startNounce, uint32_t *d_hash);
 
-extern void x11_luffaCubehash512_cpu_hash_64(uint32_t threads, uint32_t startNounce, uint32_t *d_hash, uint32_t luffacubehashthreads);
+extern void x11_luffaCubehash512_cpu_hash_64(uint32_t threads, uint32_t startNounce, uint32_t *d_hash);
 
 extern void x11_shavite512_cpu_hash_64(uint32_t threads, uint32_t startNounce, uint32_t *d_hash, uint32_t shavitethreads);
 
@@ -186,7 +186,6 @@ extern "C" int scanhash_x17(int thr_id, uint32_t *pdata,
 	int intensity = 256 * 256 * 9;
 	uint32_t simdthreads = (device_sm[device_map[thr_id]] > 500) ? 64 : 32;
 	uint32_t shavitethreads = (device_sm[device_map[thr_id]] == 500) ? 256 : 320;
-	uint32_t luffacubehashthreads = (device_sm[device_map[thr_id]] == 500) ? 512 : 256;
 
 	if (device_sm[device_map[thr_id]] == 520)  intensity = 256 * 256 * 15;
 	uint32_t throughput = device_intensity(device_map[thr_id], __func__, intensity); // 19=256*256*8;
@@ -238,7 +237,7 @@ extern "C" int scanhash_x17(int thr_id, uint32_t *pdata,
 		quark_groestl512_cpu_hash_64(throughput, pdata[19], NULL, d_hash[thr_id]);
 		quark_skein512_cpu_hash_64(throughput, pdata[19], NULL, d_hash[thr_id]);
 		cuda_jh512Keccak512_cpu_hash_64(throughput, pdata[19], d_hash[thr_id]);
-		x11_luffaCubehash512_cpu_hash_64(throughput, pdata[19], d_hash[thr_id], luffacubehashthreads);
+		x11_luffaCubehash512_cpu_hash_64(throughput, pdata[19], d_hash[thr_id]);
 		x11_shavite512_cpu_hash_64(throughput, pdata[19], d_hash[thr_id],shavitethreads);
 		x11_simd512_cpu_hash_64(thr_id, throughput, pdata[19], d_hash[thr_id], simdthreads);
 		x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], d_hash[thr_id]);