unsigned thread

sp-hash · Dec 18, 2014 · 08b9199 · 08b9199
1 parent bc41e63
commit 08b9199
Show file tree

Hide file tree

Showing 62 changed files with 498 additions and 498 deletions.
diff --git a/Algo256/blake256.cu b/Algo256/blake256.cu
@@ -245,7 +245,7 @@ __host__
 uint32_t blake256_cpu_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, const uint64_t highTarget,
 	const uint32_t crcsum, const int8_t rounds)
 {
-	const int threadsperblock = TPB;
+	const uint32_t threadsperblock = TPB;
 	uint32_t result = UINT32_MAX;
 
 	dim3 grid((threads + threadsperblock-1)/threadsperblock);
@@ -332,7 +332,7 @@ __host__
 static uint32_t blake256_cpu_hash_16(const int thr_id, const uint32_t threads, const uint32_t startNonce, const uint64_t highTarget,
 	const int8_t rounds)
 {
-	const int threadsperblock = TPB;
+	const uint32_t threadsperblock = TPB;
 	uint32_t result = UINT32_MAX;
 
 	dim3 grid((threads + threadsperblock-1)/threadsperblock);

diff --git a/Algo256/cuda_blake256.cu b/Algo256/cuda_blake256.cu
@@ -218,7 +218,7 @@ void blake256_gpu_hash_80(const uint32_t threads, const uint32_t startNonce, uin
 __host__
 void blake256_cpu_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, uint64_t *Hash, int order)
 {
-	const int threadsperblock = 256;
+	const uint32_t threadsperblock = 256;
 
 	dim3 grid((threads + threadsperblock - 1) / threadsperblock);
 	dim3 block(threadsperblock);
@@ -243,7 +243,7 @@ void blake256_cpu_setBlock_80(uint32_t *pdata)
 }
 
 __host__
-void blake256_cpu_init(int thr_id, int threads)
+void blake256_cpu_init(int thr_id, uint32_t threads)
 {
 	cudaMemcpyToSymbol(u256, c_u256, sizeof(c_u256), 0, cudaMemcpyHostToDevice);
 	cudaMemcpyToSymbol(sigma, c_sigma, sizeof(c_sigma), 0, cudaMemcpyHostToDevice);

diff --git a/Algo256/cuda_fugue256.cu b/Algo256/cuda_fugue256.cu
@@ -548,7 +548,7 @@ __global__ void  __launch_bounds__(256)
 #else
 __global__ void
 #endif
-fugue256_gpu_hash(int thr_id, int threads, uint32_t startNounce, void *outputHash, uint32_t *resNounce)
+fugue256_gpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, void *outputHash, uint32_t *resNounce)
 {
 #if USE_SHARED
 	extern __shared__ char mixtabs[];
@@ -561,7 +561,7 @@ fugue256_gpu_hash(int thr_id, int threads, uint32_t startNounce, void *outputHas
 	__syncthreads();
 #endif
 
-	int thread = (blockDim.x * blockIdx.x + threadIdx.x);
+	uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
 	if (thread < threads)
 	{
 		/* Nimm den State und verarbeite das letztenByte (die Nounce) */
@@ -718,7 +718,7 @@ fugue256_gpu_hash(int thr_id, int threads, uint32_t startNounce, void *outputHas
 	  cudaBindTexture(NULL, &texname, texmem, &channelDesc, texsize ); }
 
 
-void fugue256_cpu_init(int thr_id, int threads)
+void fugue256_cpu_init(int thr_id, uint32_t threads)
 {
 	cudaSetDevice(device_map[thr_id]);
 
@@ -751,12 +751,12 @@ __host__ void fugue256_cpu_setBlock(int thr_id, void *data, void *pTargetIn)
 	cudaMemset(d_resultNonce[thr_id], 0xFF, sizeof(uint32_t));
 }
 
-__host__ void fugue256_cpu_hash(int thr_id, int threads, int startNounce, void *outputHashes, uint32_t *nounce)
+__host__ void fugue256_cpu_hash(int thr_id, uint32_t threads, int startNounce, void *outputHashes, uint32_t *nounce)
 {
 #if USE_SHARED
-	const int threadsperblock = 256; // Alignment mit mixtab Grösse. NICHT ÄNDERN
+	const uint32_t threadsperblock = 256; // Alignment mit mixtab Grösse. NICHT ÄNDERN
 #else
-	const int threadsperblock = 512; // so einstellen wie gewünscht ;-)
+	const uint32_t threadsperblock = 512; // so einstellen wie gewünscht ;-)
 #endif
 	// berechne wie viele Thread Blocks wir brauchen
 	dim3 grid((threads + threadsperblock-1)/threadsperblock);

diff --git a/Algo256/cuda_groestl256.cu b/Algo256/cuda_groestl256.cu
@@ -105,7 +105,7 @@ extern uint32_t T3up_cpu[];
 extern uint32_t T3dn_cpu[];
 
 __device__ __forceinline__
-void groestl256_perm_P(int thread,uint32_t *a, char *mixtabs)
+void groestl256_perm_P(uint32_t thread,uint32_t *a, char *mixtabs)
 {
 	#pragma unroll 10
 	for (int r = 0; r<10; r++)
@@ -136,7 +136,7 @@ void groestl256_perm_P(int thread,uint32_t *a, char *mixtabs)
 }
 
 __device__ __forceinline__
-void groestl256_perm_Q(int thread, uint32_t *a, char *mixtabs)
+void groestl256_perm_Q(uint32_t thread, uint32_t *a, char *mixtabs)
 {
 	#pragma unroll
 	for (int r = 0; r<10; r++)
@@ -175,7 +175,7 @@ void groestl256_perm_Q(int thread, uint32_t *a, char *mixtabs)
 }
 
 __global__ __launch_bounds__(256,1)
-void groestl256_gpu_hash32(int threads, uint32_t startNounce, uint64_t *outputHash, uint32_t *nonceVector)
+void groestl256_gpu_hash32(uint32_t threads, uint32_t startNounce, uint64_t *outputHash, uint32_t *nonceVector)
 {
 #if USE_SHARED
 	extern __shared__ char mixtabs[];
@@ -194,7 +194,7 @@ void groestl256_gpu_hash32(int threads, uint32_t startNounce, uint64_t *outputHa
 	__syncthreads();
 #endif
 
-	int thread = (blockDim.x * blockIdx.x + threadIdx.x);
+	uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
 	if (thread < threads)
 	{
 		// GROESTL
@@ -259,7 +259,7 @@ void groestl256_gpu_hash32(int threads, uint32_t startNounce, uint64_t *outputHa
 	  cudaBindTexture(NULL, &texname, texmem, &channelDesc, texsize ); } \
 
 __host__
-void groestl256_cpu_init(int thr_id, int threads)
+void groestl256_cpu_init(int thr_id, uint32_t threads)
 {
 
 	// Texturen mit obigem Makro initialisieren
@@ -277,11 +277,11 @@ void groestl256_cpu_init(int thr_id, int threads)
 }
 
 __host__
-uint32_t groestl256_cpu_hash_32(int thr_id, int threads, uint32_t startNounce, uint64_t *d_outputHash, int order)
+uint32_t groestl256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *d_outputHash, int order)
 {
 	uint32_t result = 0xffffffff;
 	cudaMemset(d_GNonce[thr_id], 0xff, sizeof(uint32_t));
-	const int threadsperblock = 256;
+	const uint32_t threadsperblock = 256;
 
 	// berechne wie viele Thread Blocks wir brauchen
 	dim3 grid((threads + threadsperblock-1)/threadsperblock);

diff --git a/Algo256/cuda_keccak256.cu b/Algo256/cuda_keccak256.cu
@@ -535,9 +535,9 @@ static void keccak_blockv30_80(uint64_t *s, const uint64_t *keccak_round_constan
 #endif
 
 __global__ __launch_bounds__(128,5)
-void keccak256_gpu_hash_80(int threads, uint32_t startNounce, void *outputHash, uint32_t *resNounce)
+void keccak256_gpu_hash_80(uint32_t threads, uint32_t startNounce, void *outputHash, uint32_t *resNounce)
 {
-	int thread = (blockDim.x * blockIdx.x + threadIdx.x);
+	uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
 	if (thread < threads)
 	{
 		uint32_t nounce = startNounce + thread;
@@ -575,11 +575,11 @@ void keccak256_gpu_hash_80(int threads, uint32_t startNounce, void *outputHash,
 }
 
 __host__
-uint32_t keccak256_cpu_hash_80(int thr_id, int threads, uint32_t startNounce, uint32_t *d_outputHash, int order)
+uint32_t keccak256_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_outputHash, int order)
 {
 	uint32_t result = UINT32_MAX;
 	cudaMemset(d_KNonce[thr_id], 0xff, sizeof(uint32_t));
-	const int threadsperblock = 128;
+	const uint32_t threadsperblock = 128;
 
 	dim3 grid((threads + threadsperblock-1)/threadsperblock);
 	dim3 block(threadsperblock);
@@ -597,9 +597,9 @@ uint32_t keccak256_cpu_hash_80(int thr_id, int threads, uint32_t startNounce, ui
 }
 
 __global__ __launch_bounds__(256,3)
-void keccak256_gpu_hash_32(int threads, uint32_t startNounce, uint64_t *outputHash)
+void keccak256_gpu_hash_32(uint32_t threads, uint32_t startNounce, uint64_t *outputHash)
 {
-	int thread = (blockDim.x * blockIdx.x + threadIdx.x);
+	uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
 	if (thread < threads)
 	{
 #if __CUDA_ARCH__ >= 350 /* tpr: to double check if faster on SM5+ */
@@ -637,9 +637,9 @@ void keccak256_gpu_hash_32(int threads, uint32_t startNounce, uint64_t *outputHa
 }
 
 __host__
-void keccak256_cpu_hash_32(int thr_id, int threads, uint32_t startNounce, uint64_t *d_outputHash, int order)
+void keccak256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *d_outputHash, int order)
 {
-	const int threadsperblock = 256;
+	const uint32_t threadsperblock = 256;
 
 	dim3 grid((threads + threadsperblock - 1) / threadsperblock);
 	dim3 block(threadsperblock);
@@ -658,7 +658,7 @@ void keccak256_setBlock_80(void *pdata,const void *pTargetIn)
 }
 
 __host__
-void keccak256_cpu_init(int thr_id, int threads)
+void keccak256_cpu_init(int thr_id, uint32_t threads)
 {
 	CUDA_SAFE_CALL(cudaMalloc(&d_KNonce[thr_id], sizeof(uint32_t)));
 	CUDA_SAFE_CALL(cudaMallocHost(&d_nounce[thr_id], 1*sizeof(uint32_t)));

diff --git a/Algo256/cuda_skein256.cu b/Algo256/cuda_skein256.cu
@@ -98,9 +98,9 @@ void Round_8_512v35(uint2 *ks, uint2 *ts,
 
 
 __global__ __launch_bounds__(256,3)
-void skein256_gpu_hash_32(int threads, uint32_t startNounce, uint64_t *outputHash)
+void skein256_gpu_hash_32(uint32_t threads, uint32_t startNounce, uint64_t *outputHash)
 {
-	int thread = (blockDim.x * blockIdx.x + threadIdx.x);
+	uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
 	if (thread < threads)
 	{
 		uint2 h[9];
@@ -176,15 +176,15 @@ void skein256_gpu_hash_32(int threads, uint32_t startNounce, uint64_t *outputHas
 }
 
 __host__
-void skein256_cpu_init(int thr_id, int threads)
+void skein256_cpu_init(int thr_id, uint32_t threads)
 {
 	//empty
 }
 
 __host__
-void skein256_cpu_hash_32(int thr_id, int threads, uint32_t startNounce, uint64_t *d_outputHash, int order)
+void skein256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *d_outputHash, int order)
 {
-	const int threadsperblock = 256;
+	const uint32_t threadsperblock = 256;
 
 	dim3 grid((threads + threadsperblock - 1) / threadsperblock);
 	dim3 block(threadsperblock);

diff --git a/Algo256/keccak256.cu b/Algo256/keccak256.cu
@@ -16,9 +16,9 @@ extern "C"
 
 static uint32_t *d_hash[8];
 
-extern void keccak256_cpu_init(int thr_id, int threads);
+extern void keccak256_cpu_init(int thr_id, uint32_t threads);
 extern void keccak256_setBlock_80(void *pdata,const void *ptarget);
-extern uint32_t keccak256_cpu_hash_80(int thr_id, int threads, uint32_t startNounce, uint32_t *d_hash, int order);
+extern uint32_t keccak256_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int order);
 
 // CPU Hash
 extern "C" void keccak256_hash(void *state, const void *input)

diff --git a/JHA/cuda_jha_compactionTest.cu b/JHA/cuda_jha_compactionTest.cu
@@ -29,7 +29,7 @@ __device__ cuda_compactTestFunction_t d_JackpotTrueFunction = JackpotTrueTest, d
 cuda_compactTestFunction_t h_JackpotTrueFunction[8], h_JackpotFalseFunction[8];
 
 // Setup-Funktionen
-__host__ void jackpot_compactTest_cpu_init(int thr_id, int threads)
+__host__ void jackpot_compactTest_cpu_init(int thr_id, uint32_t threads)
 {
 	cudaMemcpyFromSymbol(&h_JackpotTrueFunction[thr_id], d_JackpotTrueFunction, sizeof(cuda_compactTestFunction_t));
 	cudaMemcpyFromSymbol(&h_JackpotFalseFunction[thr_id], d_JackpotFalseFunction, sizeof(cuda_compactTestFunction_t));
@@ -55,7 +55,7 @@ __host__ void jackpot_compactTest_cpu_init(int thr_id, int threads)
 #endif
 
 // Die Summenfunktion (vom NVIDIA SDK)
-__global__ void jackpot_compactTest_gpu_SCAN(uint32_t *data, int width, uint32_t *partial_sums=NULL, cuda_compactTestFunction_t testFunc=NULL, int threads=0, uint32_t startNounce=0, uint32_t *inpHashes=NULL, uint32_t *d_validNonceTable=NULL)
+__global__ void jackpot_compactTest_gpu_SCAN(uint32_t *data, int width, uint32_t *partial_sums=NULL, cuda_compactTestFunction_t testFunc=NULL, uint32_t threads=0, uint32_t startNounce=0, uint32_t *inpHashes=NULL, uint32_t *d_validNonceTable=NULL)
 {
 	extern __shared__ uint32_t sums[];
 	int id = ((blockIdx.x * blockDim.x) + threadIdx.x);
@@ -184,7 +184,7 @@ __global__ void jackpot_compactTest_gpu_ADD(uint32_t *data, uint32_t *partial_su
 }
 
 // Der Scatter
-__global__ void jackpot_compactTest_gpu_SCATTER(uint32_t *sum, uint32_t *outp, cuda_compactTestFunction_t testFunc, int threads=0, uint32_t startNounce=0, uint32_t *inpHashes=NULL, uint32_t *d_validNonceTable=NULL)
+__global__ void jackpot_compactTest_gpu_SCATTER(uint32_t *sum, uint32_t *outp, cuda_compactTestFunction_t testFunc, uint32_t threads=0, uint32_t startNounce=0, uint32_t *inpHashes=NULL, uint32_t *d_validNonceTable=NULL)
 {
 	int id = ((blockIdx.x * blockDim.x) + threadIdx.x);
 	uint32_t actNounce = id;
@@ -233,7 +233,7 @@ __host__ static uint32_t jackpot_compactTest_roundUpExp(uint32_t val)
 	return mask;
 }
 
-__host__ void jackpot_compactTest_cpu_singleCompaction(int thr_id, int threads, uint32_t *nrm,
+__host__ void jackpot_compactTest_cpu_singleCompaction(int thr_id, uint32_t threads, uint32_t *nrm,
 														uint32_t *d_nonces1, cuda_compactTestFunction_t function,
 														uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable)
 {
@@ -289,7 +289,7 @@ __host__ void jackpot_compactTest_cpu_singleCompaction(int thr_id, int threads,
 }
 
 ////// ACHTUNG: Diese funktion geht aktuell nur mit threads > 65536 (Am besten 256 * 1024 oder 256*2048)
-__host__ void jackpot_compactTest_cpu_dualCompaction(int thr_id, int threads, uint32_t *nrm,
+__host__ void jackpot_compactTest_cpu_dualCompaction(int thr_id, uint32_t threads, uint32_t *nrm,
 													 uint32_t *d_nonces1, uint32_t *d_nonces2,
 													 uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable)
 {
@@ -328,7 +328,7 @@ __host__ void jackpot_compactTest_cpu_dualCompaction(int thr_id, int threads, ui
 	*/
 }
 
-__host__ void jackpot_compactTest_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable,
+__host__ void jackpot_compactTest_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable,
 											uint32_t *d_nonces1, size_t *nrm1,
 											uint32_t *d_nonces2, size_t *nrm2,
 											int order)

diff --git a/JHA/cuda_jha_keccak512.cu b/JHA/cuda_jha_keccak512.cu
@@ -100,9 +100,9 @@ keccak_block(uint64_t *s, const uint32_t *in, const uint64_t *keccak_round_const
     }
 }
 
-__global__ void jackpot_keccak512_gpu_hash(int threads, uint32_t startNounce, uint64_t *g_hash)
+__global__ void jackpot_keccak512_gpu_hash(uint32_t threads, uint32_t startNounce, uint64_t *g_hash)
 {
-    int thread = (blockDim.x * blockIdx.x + threadIdx.x);
+    uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
     if (thread < threads)
     {
         uint32_t nounce = startNounce + thread;
@@ -145,7 +145,7 @@ __global__ void jackpot_keccak512_gpu_hash(int threads, uint32_t startNounce, ui
 }
 
 // Setup-Funktionen
-__host__ void jackpot_keccak512_cpu_init(int thr_id, int threads)
+__host__ void jackpot_keccak512_cpu_init(int thr_id, uint32_t threads)
 {
     // Kopiere die Hash-Tabellen in den GPU-Speicher
     cudaMemcpyToSymbol( c_keccak_round_constants,
@@ -522,9 +522,9 @@ __host__ void jackpot_keccak512_cpu_setBlock(void *pdata, size_t inlen)
                         0, cudaMemcpyHostToDevice);
 }
 
-__host__ void jackpot_keccak512_cpu_hash(int thr_id, int threads, uint32_t startNounce, uint32_t *d_hash, int order)
+__host__ void jackpot_keccak512_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int order)
 {
-    const int threadsperblock = 256;
+    const uint32_t threadsperblock = 256;
 
     // berechne wie viele Thread Blocks wir brauchen
     dim3 grid((threads + threadsperblock-1)/threadsperblock);

diff --git a/JHA/jackpotcoin.cu b/JHA/jackpotcoin.cu
@@ -12,27 +12,27 @@ extern "C"
 
 static uint32_t *d_hash[8];
 
-extern void jackpot_keccak512_cpu_init(int thr_id, int threads);
+extern void jackpot_keccak512_cpu_init(int thr_id, uint32_t threads);
 extern void jackpot_keccak512_cpu_setBlock(void *pdata, size_t inlen);
-extern void jackpot_keccak512_cpu_hash(int thr_id, int threads, uint32_t startNounce, uint32_t *d_hash, int order);
+extern void jackpot_keccak512_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int order);
 
-extern void quark_blake512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
+extern void quark_blake512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
 
-extern void quark_groestl512_cpu_init(int thr_id, int threads);
-extern void quark_groestl512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
+extern void quark_groestl512_cpu_init(int thr_id, uint32_t threads);
+extern void quark_groestl512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
 
-extern void quark_jh512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
+extern void quark_jh512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
 
 extern void quark_skein512_cpu_init(int thr_id);
-extern void quark_skein512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
+extern void quark_skein512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
 
-extern void jackpot_compactTest_cpu_init(int thr_id, int threads);
-extern void jackpot_compactTest_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable, 
+extern void jackpot_compactTest_cpu_init(int thr_id, uint32_t threads);
+extern void jackpot_compactTest_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable, 
 											uint32_t *d_nonces1, size_t *nrm1,
 											uint32_t *d_nonces2, size_t *nrm2,
 											int order);
 
-extern uint32_t cuda_check_hash_branch(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order);
+extern uint32_t cuda_check_hash_branch(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order);
 
 // Speicher zur Generierung der Noncevektoren für die bedingten Hashes
 static uint32_t *d_jackpotNonces[8];