NeoScrypt: 7% faster on the 750 Ti, 4% faster on the 970
runestensland committed Jul 25, 2015
1 parent 79f3cd1 commit 3dd9b76
Showing 2 changed files with 111 additions and 53 deletions.
2 changes: 1 addition & 1 deletion cuda_vector.h
@@ -770,7 +770,7 @@ static __device__ __inline__ uint32_t __ldgtoint_unaligned2(const uint8_t *ptr)

#endif

static __forceinline__ __device__ void shift256R2(uint32_t * ret, const uint8 &vec4, uint32_t shift)
static __forceinline__ __device__ void shift256R2(uint32_t * ret, const uint8 &vec4, const uint32_t shift)
{
uint32_t truc = 0, truc2 = cuda_swab32(vec4.s7), truc3 = 0;
asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(truc) : "r"(truc3), "r"(truc2), "r"(shift));
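// Note (not part of the patch): the inline PTX above is a right funnel shift —
// truc receives the low 32 bits of ((truc2:truc3) >> shift), with shift clamped to 32.
// An equivalent intrinsic form would be the sketch below:
//   truc = __funnelshift_rc(truc3, truc2, shift);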
162 changes: 110 additions & 52 deletions neoscrypt/cuda_neoscrypt.cu
@@ -346,6 +346,7 @@ static __forceinline__ __host__ void Blake2Shost(uint32_t * inout, const uint32_

V.hi.s4 ^= BLAKE2S_BLOCK_SIZE;


for (int x = 0; x < 10; ++x)
{
BLAKE_Ghost(x, 0x00, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inkey);
@@ -392,56 +393,80 @@ static __forceinline__ __device__ void fastkdf256(int thread, const uint32_t * p
uchar4 bufhelper;
uint8_t A[320],B[288];

((uintx64*)A)[0] = ((uintx64*)password)[0];
((uint816 *)A)[4] = ((uint816 *)password)[0];

((uintx64*)B)[0] = ((uintx64*)password)[0];
((uint48 *)B)[8] = ((uint48 *)password)[0];
((uintx64*)A)[0] = ((uintx64*)password)[0];
((uint816 *)A)[4] = ((uint816 *)password)[0];

uint32_t input[BLAKE2S_BLOCK_SIZE/4]; uint32_t key[BLAKE2S_BLOCK_SIZE / 4]={0};
((uintx64*)B)[0] = ((uintx64*)password)[0];
((uint48 *)B)[8] = ((uint48 *)password)[0];

((uint816*)input)[0] = ((uint816*)input_init)[0];
((uint48*)key)[0] = ((uint48*)key_init)[0];
uint32_t input[BLAKE2S_BLOCK_SIZE/4]; uint32_t key[BLAKE2S_BLOCK_SIZE / 4]={0};

((uint816*)input)[0] = ((uint816*)input_init)[0];
((uint48*)key)[0] = ((uint48*)key_init)[0];

for (int i = 0; i < 32; ++i)
#pragma unroll
for (int i = 0; i < 31; ++i)
{

// Blake2Stest(thread,input, key);

bufhelper = ((uchar4*)input)[0];
#pragma unroll
for (int x = 1; x < BLAKE2S_OUT_SIZE / 4; ++x) { bufhelper += ((uchar4*)input)[x]; }
bufidx = bufhelper.x + bufhelper.y + bufhelper.z + bufhelper.w;

int qbuf = bufidx/4;
int qbuf = bufidx >> 2;
int rbuf = bufidx&3;
int bitbuf = rbuf << 3;
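// bufidx is a byte offset into the buffer: qbuf is the 32-bit word index (bufidx / 4),
// rbuf the byte within that word, and bitbuf the matching bit offset (rbuf * 8)
// passed to shift256R2 below.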
uint32_t shifted[9];

shift256R2(shifted, ((uint8*)input)[0], bitbuf);

for (int k = 0; k < 9; ++k) {
#pragma unroll
for (int k = 0; k < 9; ++k)
{
((uint32_t *)B)[k + qbuf] ^= ((uint32_t *)shifted)[k];
}

if (bufidx < BLAKE2S_KEY_SIZE) {((uint8*)B)[8] = ((uint8*)B)[0];}
else if (bufidx > FASTKDF_BUFFER_SIZE-BLAKE2S_OUT_SIZE) {((uint8*)B)[0] = ((uint8*)B)[8];}
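// B keeps a 32-byte mirror of its head at the tail; when the XOR window touches
// either end, the mirror is refreshed so wrapped reads stay consistent.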

if (i<31) {
for (int k = 0; k <BLAKE2S_BLOCK_SIZE / 4; k++) {
#pragma unroll
for (int k = 0; k <BLAKE2S_BLOCK_SIZE / 4; k++)
{
((uchar4*)(input))[k] = make_uchar4((A + bufidx)[4 * k], (A + bufidx)[4 * k + 1],
(A + bufidx)[4 * k + 2], (A + bufidx)[4 * k + 3]);
}

for (int k = 0; k <BLAKE2S_KEY_SIZE / 4; k++) {
#pragma unroll
for (int k = 0; k <BLAKE2S_KEY_SIZE / 4; k++)
{
((uchar4*)(key))[k] = make_uchar4((B + bufidx)[4 * k], (B + bufidx)[4 * k + 1],
(B + bufidx)[4 * k + 2], (B + bufidx)[4 * k + 3]);
(B + bufidx)[4 * k + 2], (B + bufidx)[4 * k + 3]);
}
Blake2S((uint32_t*)input, key);
}

}
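// Last mixing round, peeled out of the loop: no input/key rebuild or further
// Blake2S call is needed here, so the 31-iteration loop above can carry
// #pragma unroll without the old per-iteration i < 31 branch.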
bufhelper = ((uchar4*)input)[0];
#pragma unroll
for (int x = 1; x < BLAKE2S_OUT_SIZE / 4; ++x) { bufhelper += ((uchar4*)input)[x]; }
bufidx = bufhelper.x + bufhelper.y + bufhelper.z + bufhelper.w;

int qbuf = bufidx >> 2;
int rbuf = bufidx & 3;
int bitbuf = rbuf << 3;
uint32_t shifted[9];

shift256R2(shifted, ((uint8*)input)[0], bitbuf);

#pragma unroll
for (int k = 0; k < 9; ++k) {
((uint32_t *)B)[k + qbuf] ^= ((uint32_t *)shifted)[k];
}

if (bufidx < BLAKE2S_KEY_SIZE) { ((uint8*)B)[8] = ((uint8*)B)[0]; }
else if (bufidx > FASTKDF_BUFFER_SIZE - BLAKE2S_OUT_SIZE) { ((uint8*)B)[0] = ((uint8*)B)[8]; }


int left = FASTKDF_BUFFER_SIZE - bufidx;
int qleft =left/4;
int qleft = left >> 2;
int rleft =left&3;
for (int k = 0; k < qleft; ++k) { ((uchar4*)output)[k] =
make_uchar4((B + bufidx)[4 * k], (B + bufidx)[4 * k + 1],
@@ -470,17 +495,19 @@ static __forceinline__ __device__ void fastkdf32( const uint32_t * password, con
((uint816*)A)[4] = ((uint816*)password)[0];
((uintx64*)B)[0] = ((uintx64*)salt)[0];
((uintx64*)B)[1] = ((uintx64*)salt)[0];
uint32_t input[BLAKE2S_BLOCK_SIZE/4]; uint32_t key[BLAKE2S_BLOCK_SIZE/4]={0};
((uint816*)input)[0] = ((uint816*)password)[0];
((uint48*)key)[0] = ((uint48*)salt)[0];
uint32_t input[BLAKE2S_BLOCK_SIZE/4]; uint32_t key[BLAKE2S_BLOCK_SIZE/4]={0};
((uint816*)input)[0] = ((uint816*)password)[0];
((uint48*)key)[0] = ((uint48*)salt)[0];

for (int i = 0; i < 32; ++i)
#pragma unroll
for (int i = 0; i < 31; ++i)
{

Blake2S((uint32_t*)input, key);

bufidx = 0;
bufhelper = ((uchar4*)input)[0];
#pragma unroll
for (int x = 1; x < BLAKE2S_OUT_SIZE / 4; ++x) { bufhelper += ((uchar4*)input)[x]; }
bufidx = bufhelper.x + bufhelper.y + bufhelper.z + bufhelper.w;
int qbuf = bufidx / 4;
@@ -490,26 +517,47 @@ uint32_t input[BLAKE2S_BLOCK_SIZE/4]; uint32_t key[BLAKE2S_BLOCK_SIZE/4]={0};

shift256R2(shifted, ((uint8*)input)[0], bitbuf);

for (int k = 0; k < 9; ++k) {
#pragma unroll
for (int k = 0; k < 9; ++k)
{
((uint32_t *)B)[k + qbuf] ^= ((uint32_t *)shifted)[k];
}

if (i<31){
if (bufidx < BLAKE2S_KEY_SIZE) {((uint8*)B)[8] = ((uint8*)B)[0];}
else if (bufidx > FASTKDF_BUFFER_SIZE - BLAKE2S_OUT_SIZE) {((uint8*)B)[0] = ((uint8*)B)[8];}
// MyUnion Test;

for (uint8_t k = 0; k <BLAKE2S_BLOCK_SIZE/4 ; k++) {
((uchar4*)(input))[k] =
make_uchar4((A + bufidx)[4 * k], (A + bufidx)[4 * k + 1], (A + bufidx)[4 * k + 2], (A + bufidx)[4 * k + 3]);
}
for (uint8_t k = 0; k <BLAKE2S_KEY_SIZE / 4; k++) {
((uchar4*)(key))[k] =
make_uchar4((B + bufidx)[4 * k], (B + bufidx)[4 * k + 1], (B + bufidx)[4 * k + 2], (B + bufidx)[4 * k + 3]);
#pragma unroll
for (uint8_t k = 0; k <BLAKE2S_BLOCK_SIZE / 4; k++)
{
((uchar4*)(input))[k] =
make_uchar4((A + bufidx)[4 * k], (A + bufidx)[4 * k + 1], (A + bufidx)[4 * k + 2], (A + bufidx)[4 * k + 3]);
}
#pragma unroll
for (uint8_t k = 0; k <BLAKE2S_KEY_SIZE / 4; k++)
{
((uchar4*)(key))[k] =
make_uchar4((B + bufidx)[4 * k], (B + bufidx)[4 * k + 1], (B + bufidx)[4 * k + 2], (B + bufidx)[4 * k + 3]);
}
}
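// Final round peeled out of the loop, mirroring the fastkdf256 change above:
// one more Blake2S and buffer mix without the input/key rebuild.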

Blake2S((uint32_t*)input, key);

bufidx = 0;
bufhelper = ((uchar4*)input)[0];
#pragma unroll
for (int x = 1; x < BLAKE2S_OUT_SIZE / 4; ++x) { bufhelper += ((uchar4*)input)[x]; }
bufidx = bufhelper.x + bufhelper.y + bufhelper.z + bufhelper.w;
int qbuf = bufidx / 4;
int rbuf = bufidx & 3;
int bitbuf = rbuf << 3;
uint32_t shifted[9];

shift256R2(shifted, ((uint8*)input)[0], bitbuf);

#pragma unroll
for (int k = 0; k < 9; ++k)
{
((uint32_t *)B)[k + qbuf] ^= ((uint32_t *)shifted)[k];
}


uchar4 unfucked[1];
unfucked[0] = make_uchar4(B[28 + bufidx], B[29 + bufidx],B[30 + bufidx], B[31 + bufidx]);
@@ -569,8 +617,7 @@ c += d; b = rotate(b^c, 7); \




static __forceinline__ __device__ uint16 salsa_small_scalar_rnd(const uint16 &X)
__forceinline__ __device__ uint16 salsa_small_scalar_rnd(const uint16 &X)
{
uint16 state = X;
uint32_t t;
@@ -580,7 +627,7 @@ static __forceinline__ __device__ uint16 salsa_small_scalar_rnd(const uint16 &X)
return(X + state);
}

static __device__ __forceinline__ uint16 chacha_small_parallel_rnd(const uint16 &X)
__device__ __forceinline__ uint16 chacha_small_parallel_rnd(const uint16 &X)
{

uint16 st = X;
@@ -621,18 +668,25 @@ static __device__ __forceinline__ void neoscrypt_salsa(uint16 *XV)
#define SHIFT 130


__global__ __launch_bounds__(128, 1) void neoscrypt_gpu_hash_k0(int stratum, int threads, uint32_t startNonce)
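// Occupancy hint now depends on architecture: sm_52 parts such as the GTX 970
// request at least 2 resident blocks per SM, while sm_50 parts such as the
// 750 Ti request 3.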
__global__
#if __CUDA_ARCH__ > 500
__launch_bounds__(128, 2)
#else
__launch_bounds__(128, 3)
#endif
void neoscrypt_gpu_hash_k0(int stratum, int threads, uint32_t startNonce)
{

int thread = (blockDim.x * blockIdx.x + threadIdx.x);
int shift = SHIFT * 16 * thread;
// if (thread < threads)
if (thread < threads)
{
const uint32_t nonce = startNonce + thread;

uint16 X[4];
uint32_t data[80];

#pragma unroll
for (int i = 0; i<20; i++) { ((uint4*)data)[i] = ((uint4 *)c_data)[i]; } //ld.local.v4
data[19] = (stratum) ? cuda_swab32(nonce) : nonce; //freaking morons !!!
data[39] = data[19];
@@ -645,19 +699,19 @@ __global__ __launch_bounds__(128, 1) void neoscrypt_gpu_hash_k0(int stratum, int
}
}

__global__ __launch_bounds__(128, 1) void neoscrypt_gpu_hash_k01(int threads, uint32_t startNonce)
__global__ __launch_bounds__(128, 2) void neoscrypt_gpu_hash_k01(int threads, uint32_t startNonce)
{

int thread = (blockDim.x * blockIdx.x + threadIdx.x);
int shift = SHIFT * 16 * thread;
// if (thread < threads)
if (thread < threads)
{


uint16 X[4];
((uintx64 *)X)[0]= __ldg32(&(W + shift)[0]);

//#pragma unroll
#pragma unroll
for (int i = 0; i < 128; ++i)
{
neoscrypt_chacha(X);
@@ -670,16 +724,17 @@ __global__ __launch_bounds__(128, 1) void neoscrypt_gpu_hash_k01(int threads, ui
}
}

__global__ __launch_bounds__(128, 1) void neoscrypt_gpu_hash_k2(int threads, uint32_t startNonce)
__global__ __launch_bounds__(128, 2) void neoscrypt_gpu_hash_k2(int threads, uint32_t startNonce)
{

int thread = (blockDim.x * blockIdx.x + threadIdx.x);
int shift = SHIFT * 16 * thread;
// if (thread < threads)
if (thread < threads)
{
uint16 X[4];
((uintx64 *)X)[0] = __ldg32(&(W + shift)[2048]);

#pragma unroll
for (int t = 0; t < 128; t++)
{
int idx = X[3].lo.s0 & 0x7F;
@@ -692,10 +747,10 @@ __global__ __launch_bounds__(128, 1) void neoscrypt_gpu_hash_k2(int threads, uin
}
}

__global__ __launch_bounds__(128, 1) void neoscrypt_gpu_hash_k3(int threads, uint32_t startNonce)
__global__ __launch_bounds__(128, 2) void neoscrypt_gpu_hash_k3(int threads, uint32_t startNonce)
{
int thread = (blockDim.x * blockIdx.x + threadIdx.x);
// if (thread < threads)
if (thread < threads)
{

int shift = SHIFT * 16 * thread;
@@ -704,7 +759,7 @@ __global__ __launch_bounds__(128, 1) void neoscrypt_gpu_hash_k3(int threads, uin

((uintx64*)Z)[0] = __ldg32(&(W + shift)[0]);

//#pragma unroll
#pragma unroll
for (int i = 0; i < 128; ++i)
{
neoscrypt_salsa(Z);
@@ -716,11 +771,12 @@ __global__ __launch_bounds__(128, 1) void neoscrypt_gpu_hash_k4(int stratum,int
}
}

__global__ __launch_bounds__(128, 1) void neoscrypt_gpu_hash_k4(int stratum,int threads, uint32_t startNonce, uint32_t *nonceVector)
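// The final kernel drops to 32-thread blocks with a hint of 12 resident blocks
// per SM, consistent with the host-side threadsperblock change from 128 to 32
// further down.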

__global__ __launch_bounds__(32, 12) void neoscrypt_gpu_hash_k4(int stratum,int threads, uint32_t startNonce, uint32_t *nonceVector)
{

int thread = (blockDim.x * blockIdx.x + threadIdx.x);
// if (thread < threads)
if (thread < threads)
{
const uint32_t nonce = startNonce + thread;

@@ -730,11 +786,13 @@ __global__ __launch_bounds__(128, 1) void neoscrypt_gpu_hash_k4(int stratum,int

uint32_t data[80];

#pragma unroll
for (int i = 0; i<20; i++) { ((uint4*)data)[i] = ((uint4 *)c_data)[i]; }
data[19] = (stratum) ? cuda_swab32(nonce) : nonce;
data[39] = data[19];
data[59] = data[19];
((uintx64 *)Z)[0] = __ldg32(&(W + shift)[2048]);
#pragma unroll
for (int t = 0; t < 128; t++)
{
int idx = Z[3].lo.s0 & 0x7F;
@@ -765,7 +823,7 @@ __host__ uint32_t neoscrypt_cpu_hash_k4(int stratum,int thr_id, int threads, uin
cudaMemset(d_NNonce[thr_id], 0xffffffff, sizeof(uint32_t));


const int threadsperblock = 128;
const int threadsperblock = 32;


dim3 grid((threads + threadsperblock - 1) / threadsperblock);
