Upgrade to 2.6.0-hide-2.1.0
rapid821 committed Nov 18, 2018
1 parent d0bb189 commit c96ffe3
Showing 8 changed files with 704 additions and 656 deletions.
2 changes: 1 addition & 1 deletion README.md
@@ -1,7 +1,7 @@
###### fireice-uk's and psychocrypt's
# XMR-Stak - Cryptonight All-in-One Mining Software

**You must update to version [2.5.1-hide-2.0.0](https://github.com/rapid821/xmr-stak-hide/releases/tag/2.5.1-hide-2.0.0) before October 18th 2018, if you want to mine Monero.**
**You must update to version [2.5.1-hide-2.0.0+](https://github.com/rapid821/xmr-stak-hide/releases) before October 18th 2018, if you want to mine Monero.**

XMR-Stak is a universal Stratum pool miner. This miner supports CPUs, AMD and NVIDIA GPUs, and can be used to mine the cryptocurrencies Monero, Aeon, and many more Cryptonight coins.

41 changes: 41 additions & 0 deletions doc/compile.md
@@ -85,3 +85,44 @@ After the configuration you need to compile the miner, follow the guide for your
- `XMR-STAK_THREADS` gives the compiler the value of `threads` that will be used at runtime
  - default is `0` (no compile-time optimization; the value is taken at runtime)
  - if the miner is compiled and then run with the same value, the hash rate can increase: `cmake .. -DXMR-STAK_THREADS=32` (see the sketch below)
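
A minimal sketch of why a matching compile-time value can raise the hash rate (the `XMRSTAK_THREADS` macro and `threadCount` function below are hypothetical names, not the miner's real symbols): a compile-time constant lets the compiler unroll loops and fold the thread count into address arithmetic, which a runtime value prevents.

```cpp
// Illustrative only: XMRSTAK_THREADS and threadCount are hypothetical names.
#include <cstddef>

#ifndef XMRSTAK_THREADS
#define XMRSTAK_THREADS 0 // 0 means "take the value at runtime"
#endif

inline std::size_t threadCount(std::size_t runtimeThreads)
{
    // Built with e.g. -DXMRSTAK_THREADS=32, the branch folds away and every
    // caller sees the literal constant 32, enabling unrolling and constant
    // folding in the hot loops that index per-thread scratchpads.
    return XMRSTAK_THREADS != 0 ? std::size_t(XMRSTAK_THREADS)
                                : runtimeThreads;
}
```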

#### CUDA Runtime versus CUDA SDK
nVidia packages the CUDA **runtime** with the GPU drivers, and the CUDA **SDK** used to build should match it.
While it is possible to compile with an old SDK and then run on a newer runtime/driver, in most cases it does not work well.

The SDK usually bundles a driver that supports its particular CUDA version, but it is always best to get the
(usually newer) drivers from the official site.

For example: a build made with the 8.0 SDK running on a 9.2 driver crashes randomly on some GPUs, yet works fine on most 9.1
drivers. Backward compatibility "should" work, but in reality there are many cases where it does not (YMMV).

**NOTE**: The inverse case, installing the CUDA 10.0 SDK on a system with an older driver, does not magically add
CUDA 10.0 support to that driver. You must build with a CUDA SDK that matches the driver's runtime (check the driver
release notes PDF, 'supported technologies' list within the first several pages) - *OR* - upgrade the driver to at
least `411.63` to get the CUDA 10.0 runtime (unless you have Fermi cards... they can't use CUDA 9.x or 10.0, even
though newer drivers still run their *graphics* parts).

Other gotchas based on GPU family (see the sketch after this list):
* Anything older than Fermi will never work
* Fermi (arch 2x) was removed after CUDA 8.0
* Volta (arch 7x) was added in CUDA 9.0
* Turing (arch 75) was added in CUDA 10.0
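
As a sketch of how those family cut-offs could be checked at runtime, using only the standard CUDA runtime API (the helper is illustrative, not xmr-stak's actual code):

```cpp
// Sketch: reject GPU families the compiled CUDA version cannot target.
#include <cuda_runtime.h>

bool deviceUsable(int dev) // illustrative helper, not the miner's code
{
    cudaDeviceProp prop{};
    if(cudaGetDeviceProperties(&prop, dev) != cudaSuccess)
        return false;

    const int arch = prop.major * 10 + prop.minor; // e.g. 2.0 -> 20, 7.5 -> 75
    if(arch < 20)
        return false; // pre-Fermi: never supported
#if defined(CUDART_VERSION) && CUDART_VERSION >= 9000
    if(arch < 30)
        return false; // Fermi (arch 2x) was removed after CUDA 8.0
#endif
    return true;
}
```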

Here is a rough table of driver revisions and the CUDA runtime each contains:

| CUDA | Driver min | Driver max | Notes |
| ----:| ----------:| ----------:| ----- |
| 10.0 | 411.63 | (current) | |
| 9.2 | 397.93 | 399.24 | |
| 9.1 | 388.71 | 397.64 | |
| 9.0 | 387.92 | 388.59 | Fermi removed (must use CUDA == 8.0) |
| 8.0 | 372.70 | 386.28 | except 372.95, which has CUDA 7.5 |
| 7.5 | | | *Don't bother, won't compile anymore* |
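
The same data in code form, as a rough and purely illustrative encoding (Windows driver numbers, hypothetical names):

```cpp
// Rough, illustrative encoding of the table above (Windows driver numbers).
// CUDA versions use the runtime's 1000*major + 10*minor encoding.
struct CudaDriverRange
{
    int cudaVersion;        // e.g. 9020 == CUDA 9.2
    const char* driverMin;  // first driver shipping this runtime
    const char* driverMax;  // last driver shipping it ("" = current)
};

constexpr CudaDriverRange kCudaDrivers[] = {
    {10000, "411.63", ""},
    { 9020, "397.93", "399.24"},
    { 9010, "388.71", "397.64"},
    { 9000, "387.92", "388.59"}, // Fermi removed (must use CUDA == 8.0)
    { 8000, "372.70", "386.28"}, // except 372.95, which has CUDA 7.5
};
```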

nVidia generally uses the same version numbering on every OS; the table above, however, is based on the Windows
driver release notes. nVidia always lists the included CUDA runtime version in the release notes PDF for each
driver, so it doesn't hurt to double-check your specific one.

To make CUDA version matching easier, xmr-stak displays both version numbers during the CUDA detection phase,
e.g. `[9.2/10.0]`: the compiled (SDK) version and the current (driver) runtime version.
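
A minimal sketch of how such a pair can be read, using only the standard CUDA runtime API (this is not the miner's actual detection code):

```cpp
// Print "[SDK/driver]" like the log line above; illustrative only.
#include <cuda_runtime.h>
#include <cstdio>

int main()
{
    int runtimeVer = 0, driverVer = 0;
    cudaRuntimeGetVersion(&runtimeVer); // CUDA version the binary was built against
    cudaDriverGetVersion(&driverVer);   // highest CUDA version the driver supports

    // Versions are encoded as 1000*major + 10*minor, e.g. 9020 == 9.2.
    std::printf("[%d.%d/%d.%d]\n",
                runtimeVer / 1000, (runtimeVer % 100) / 10,
                driverVer / 1000, (driverVer % 100) / 10);
    return 0;
}
```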
17 changes: 11 additions & 6 deletions xmrstak/backend/amd/amd_gpu/gpu.cpp
@@ -903,6 +903,9 @@ size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, size_t platform_idx)
const char *fastIntMathV2CL =
#include "./opencl/fast_int_math_v2.cl"
;
const char *fastDivHeavyCL =
#include "./opencl/fast_div_heavy.cl"
;
const char *cryptonightCL =
#include "./opencl/cryptonight.cl"
;
@@ -924,6 +927,7 @@ size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, size_t platform_idx)

std::string source_code(cryptonightCL);
source_code = std::regex_replace(source_code, std::regex("XMRSTAK_INCLUDE_FAST_INT_MATH_V2"), fastIntMathV2CL);
source_code = std::regex_replace(source_code, std::regex("XMRSTAK_INCLUDE_FAST_DIV_HEAVY"), fastDivHeavyCL);
source_code = std::regex_replace(source_code, std::regex("XMRSTAK_INCLUDE_WOLF_AES"), wolfAesCL);
source_code = std::regex_replace(source_code, std::regex("XMRSTAK_INCLUDE_WOLF_SKEIN"), wolfSkeinCL);
source_code = std::regex_replace(source_code, std::regex("XMRSTAK_INCLUDE_JH"), jhCL);
@@ -965,7 +969,7 @@ size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t tar
input[input_len] = 0x01;
memset(input + input_len + 1, 0, 88 - input_len - 1);

size_t numThreads = ctx->rawIntensity;
cl_uint numThreads = ctx->rawIntensity;

if((ret = clEnqueueWriteBuffer(ctx->CommandQueues, ctx->InputBuffer, CL_TRUE, 0, 88, input, 0, NULL, NULL)) != CL_SUCCESS)
{
@@ -994,7 +998,7 @@ size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t tar
}

// Threads
if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][0], 3, sizeof(cl_ulong), &numThreads)) != CL_SUCCESS)
if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][0], 3, sizeof(cl_uint), &numThreads)) != CL_SUCCESS)
{
printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 0, argument 3.", err_to_str(ret));
return(ERR_OCL_API);
@@ -1017,7 +1021,7 @@ size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t tar
}

// Threads
if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][1], 2, sizeof(cl_ulong), &numThreads)) != CL_SUCCESS)
if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][1], 2, sizeof(cl_uint), &numThreads)) != CL_SUCCESS)
{
printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 1, argument 2.", err_to_str(ret));
return(ERR_OCL_API);
@@ -1077,7 +1081,7 @@ size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t tar
}

// Threads
if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][2], 6, sizeof(cl_ulong), &numThreads)) != CL_SUCCESS)
if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][2], 6, sizeof(cl_uint), &numThreads)) != CL_SUCCESS)
{
printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 6.", err_to_str(ret));
return(ERR_OCL_API);
@@ -1156,7 +1160,7 @@ size_t XMRRunJob(GpuContext* ctx, cl_uint* HashOutput, xmrstak_algo miner_algo)

clFinish(ctx->CommandQueues);

size_t Nonce[2] = {ctx->Nonce, 1}, gthreads[2] = { g_thd, 8 }, lthreads[2] = { w_size, 8 };
size_t Nonce[2] = {ctx->Nonce, 1}, gthreads[2] = { g_thd, 8 }, lthreads[2] = { 8, 8 };
if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, ctx->Kernels[kernel_storage][0], 2, Nonce, gthreads, lthreads, 0, NULL, NULL)) != CL_SUCCESS)
{
printer::inst()->print_msg(L1,"Error %s when calling clEnqueueNDRangeKernel for kernel %d.", err_to_str(ret), 0);
@@ -1208,7 +1212,8 @@ size_t XMRRunJob(GpuContext* ctx, cl_uint* HashOutput, xmrstak_algo miner_algo)
if(BranchNonces[i])
{
// Threads
if((clSetKernelArg(ctx->Kernels[kernel_storage][i + 3], 4, sizeof(cl_ulong), BranchNonces + i)) != CL_SUCCESS)
cl_uint numThreads = BranchNonces[i];
if((clSetKernelArg(ctx->Kernels[kernel_storage][i + 3], 4, sizeof(cl_uint), &numThreads)) != CL_SUCCESS)
{
printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), i + 3, 4);
return(ERR_OCL_API);
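
For context on the `cl_ulong` to `cl_uint` hunks above: the size passed to `clSetKernelArg` must match the size of the kernel parameter's declared type, otherwise the call fails with `CL_INVALID_ARG_SIZE`. A minimal standalone sketch of the corrected pattern, assuming a kernel parameter declared as `uint` (the wrapper name is illustrative):

```cpp
// Sketch of the pattern the hunks above establish: a kernel parameter
// declared as `uint Threads` is 4 bytes, so the host must pass
// sizeof(cl_uint); passing sizeof(cl_ulong) (8 bytes) would fail with
// CL_INVALID_ARG_SIZE. The wrapper name is illustrative.
#include <CL/cl.h>

cl_int setThreadsArg(cl_kernel kernel, cl_uint argIndex, size_t rawIntensity)
{
    const cl_uint numThreads = static_cast<cl_uint>(rawIntensity); // explicit narrowing
    return clSetKernelArg(kernel, argIndex, sizeof(cl_uint), &numThreads);
}
```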