diff --git a/algos.h b/algos.h index 1a16fb193c..29c6ef09a9 100644 --- a/algos.h +++ b/algos.h @@ -24,7 +24,7 @@ enum sha_algos { ALGO_X11, ALGO_X11EVO, ALGO_C11, - ALGO_SIB, + ALGO_POLY, ALGO_X13, ALGO_X14, ALGO_X15, @@ -61,7 +61,7 @@ static const char *algo_names[] = { "x11", "x11evo", "c11", - "sib", + "poly", "x13", "x14", "x15", diff --git a/bench.cpp b/bench.cpp index e3926b0a6a..e6696bce3a 100644 --- a/bench.cpp +++ b/bench.cpp @@ -60,7 +60,7 @@ void algo_free_all(int thr_id){ free_x11(thr_id); free_x11evo(thr_id); free_c11(thr_id); - free_sib(thr_id); + free_poly(thr_id); free_x13(thr_id); free_x14(thr_id); free_x15(thr_id); diff --git a/ccminer.cpp b/ccminer.cpp index cd9e7964b5..ac16cbb299 100644 --- a/ccminer.cpp +++ b/ccminer.cpp @@ -229,7 +229,7 @@ Options:\n\ qubit Qubit\n\ x11 X11 (DarkCoin)\n\ c11 C11 (Chaincoin)\n\ - sib X11+gost (Sibcoin)\n\ + poly Veltor+stuff (Polytimos)\n\ x11evo Permuted x11 (Revolver)\n\ x13 X13 (MaruCoin)\n\ x14 X14 (BernCoin)\n\ @@ -1928,7 +1928,7 @@ static void *miner_thread(void *userdata) minmax = 0x8000000; break; case ALGO_NEOSCRYPT: - case ALGO_SIB: + case ALGO_POLY: case ALGO_VELTOR: case ALGO_LYRA2: minmax = 0x80000; @@ -2063,8 +2063,8 @@ static void *miner_thread(void *userdata) case ALGO_HSR: rc = scanhash_hsr(thr_id, &work, max_nonce, &hashes_done); break; - case ALGO_SIB: - rc = scanhash_sib(thr_id, &work, max_nonce, &hashes_done); + case ALGO_POLY: + rc = scanhash_poly(thr_id, &work, max_nonce, &hashes_done); break; case ALGO_VELTOR: rc = scanhash_veltor(thr_id, &work, max_nonce, &hashes_done); diff --git a/ccminer.vcxproj b/ccminer.vcxproj index 10a006d03d..ceb7cf8f24 100644 --- a/ccminer.vcxproj +++ b/ccminer.vcxproj @@ -414,6 +414,7 @@ 72 + diff --git a/ccminer.vcxproj.filters b/ccminer.vcxproj.filters index a3d68641da..15f4f287e6 100644 --- a/ccminer.vcxproj.filters +++ b/ccminer.vcxproj.filters @@ -628,6 +628,9 @@ Source Files\CUDA\x13 + + Source Files\CUDA\x13 + Source Files\CUDA\quark diff --git a/miner.h b/miner.h index 7624acb2e1..34ce49cba2 100644 --- a/miner.h +++ b/miner.h @@ -282,7 +282,7 @@ extern int scanhash_whirl(int thr_id, struct work* work, uint32_t max_nonce, uns extern int scanhash_x11(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_x11evo(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_c11(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); -extern int scanhash_sib(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); +extern int scanhash_poly(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_x13(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_x14(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_x15(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); @@ -322,7 +322,7 @@ extern void free_whirl(int thr_id); extern void free_x11(int thr_id); extern void free_x11evo(int thr_id); extern void free_c11(int thr_id); -extern void free_sib(int thr_id); +extern void free_poly(int thr_id); extern void free_x13(int thr_id); extern void free_x14(int thr_id); extern void free_x15(int thr_id); @@ -805,7 +805,7 @@ void wcoinhash(void *state, const void *input); void x11hash(void *output, const void *input); void x11evo_hash(void *output, const void *input); void c11hash(void *output, const void *input); -void sibhash(void *output, const void *input); +void polyhash(void *output, const void *input); void x13hash(void *output, const void *input); void x14hash(void *output, const void *input); void x15hash(void *output, const void *input); diff --git a/quark/cuda_quark_skein512.cu b/quark/cuda_quark_skein512.cu index 5db1873e61..b21ce5a8c7 100644 --- a/quark/cuda_quark_skein512.cu +++ b/quark/cuda_quark_skein512.cu @@ -420,8 +420,150 @@ void skein512_gpu_hash_80(uint32_t threads, uint32_t startNounce, uint64_t *outp } } - __host__ +void skein512_cpu_setBlock_80( void *pdata) +{ + uint64_t message[20]; + memcpy(&message[0], pdata, 80); + + uint64_t p[8]; + uint64_t h[9]; + uint64_t t0, t1, t2; + + h[0] = 0x4903ADFF749C51CEull; + h[1] = 0x0D95DE399746DF03ull; + h[2] = 0x8FD1934127C79BCEull; + h[3] = 0x9A255629FF352CB1ull; + h[4] = 0x5DB62599DF6CA7B0ull; + h[5] = 0xEABE394CA9D5C3F4ull; + h[6] = 0x991112C71A75B523ull; + h[7] = 0xAE18A40B660FCC33ull; + // h[8] = h[0] ^ h[1] ^ h[2] ^ h[3] ^ h[4] ^ h[5] ^ h[6] ^ h[7] ^ SPH_C64(0x1BD11BDAA9FC1A22); + h[8] = 0xcab2076d98173ec4ULL; + + t0 = 64; // ptr + t1 = 0x7000000000000000ull; + t2 = 0x7000000000000040ull; + + memcpy(&p[0], &message[0], 64); + + TFBIG_4e_PRE(0); + TFBIG_4o_PRE(1); + TFBIG_4e_PRE(2); + TFBIG_4o_PRE(3); + TFBIG_4e_PRE(4); + TFBIG_4o_PRE(5); + TFBIG_4e_PRE(6); + TFBIG_4o_PRE(7); + TFBIG_4e_PRE(8); + TFBIG_4o_PRE(9); + TFBIG_4e_PRE(10); + TFBIG_4o_PRE(11); + TFBIG_4e_PRE(12); + TFBIG_4o_PRE(13); + TFBIG_4e_PRE(14); + TFBIG_4o_PRE(15); + TFBIG_4e_PRE(16); + TFBIG_4o_PRE(17); + TFBIG_ADDKEY_PRE(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], h, t, 18); + message[10] = message[0] ^ p[0]; + message[11] = message[1] ^ p[1]; + message[12] = message[2] ^ p[2]; + message[13] = message[3] ^ p[3]; + message[14] = message[4] ^ p[4]; + message[15] = message[5] ^ p[5]; + message[16] = message[6] ^ p[6]; + message[17] = message[7] ^ p[7]; + message[18] = t2; + + uint64_t buffer[128]; + +// buffer[ 0] = message[ 8]; + buffer[ 0] = message[ 9]; + h[0] = buffer[ 1] = message[10]; + h[1] = buffer[ 2] = message[11]; + h[2] = buffer[ 3] = message[12]; + h[3] = buffer[ 4] = message[13]; + h[4] = buffer[ 5] = message[14]; + h[5] = buffer[ 6] = message[15]; + h[6] = buffer[ 7] = message[16]; + h[7] = buffer[ 8] = message[17]; + h[8] = buffer[ 9] = h[0]^h[1]^h[2]^h[3]^h[4]^h[5]^h[6]^h[7]^0x1BD11BDAA9FC1A22ULL; + + t0 = 0x50ull; + t1 = 0xB000000000000000ull; + t2 = t0^t1; + + p[0] = message[ 8] + h[0]; + p[2] = h[2]; p[3] = h[3]; p[4] = h[4]; + p[5] = h[5] + t0; + p[6] = h[6] + t1; + p[7] = h[7]; + + p[2] += p[3]; + p[4] += p[5]; p[6] += p[7]; + + p[3] = ROTL64(p[3], 36) ^ p[2]; + p[5] = ROTL64(p[5], 19) ^ p[4]; + p[7] = ROTL64(p[7], 37) ^ p[6]; + p[4] += p[7]; p[6] += p[5]; + + p[7] = ROTL64(p[7], 27) ^ p[4]; + p[5] = ROTL64(p[5], 14) ^ p[6]; + buffer[10] = p[ 0]; + buffer[11] = p[ 2]; + buffer[12] = p[ 3]; + buffer[13] = p[ 4]; + buffer[14] = p[ 5]; + buffer[15] = p[ 6]; + buffer[16] = p[ 7]; + buffer[17] = ROTL64(p[3], 42); + buffer[18] = ROTL64(p[5], 36); + buffer[19] = ROTL64(p[7], 39); + + buffer[20] = h[6]+t1; + buffer[21] = h[8]+1; + buffer[22] = h[7]+t2; + buffer[23] = h[0]+2; + buffer[24] = h[8]+t0; + buffer[25] = h[1]+3; + buffer[26] = h[0]+t1; + buffer[27] = h[2]+4; + buffer[28] = h[1]+t2; + buffer[29] = h[3]+5; + buffer[30] = h[2]+t0; + buffer[31] = h[4]+6; + buffer[32] = h[3]+t1; + buffer[33] = h[5]+7; + buffer[34] = h[4]+t2; + buffer[35] = h[6]+8; + buffer[36] = h[5]+t0; + buffer[37] = h[7]+9; + buffer[38] = h[6]+t1; + buffer[39] = h[8]+10; + buffer[40] = h[7]+t2; + buffer[41] = h[0]+11; + buffer[42] = h[8]+t0; + buffer[43] = h[1]+12; + buffer[44] = h[0]+t1; + buffer[45] = h[2]+13; + buffer[46] = h[1]+t2; + buffer[47] = h[3]+14; + buffer[48] = h[2]+t0; + buffer[49] = h[4]+15; + buffer[50] = h[3]+t1; + buffer[51] = h[5]+16; + buffer[52] = h[4]+t2; + buffer[53] = h[6]+17; + buffer[54] = h[5]+t0; + buffer[55] = h[7]+18; + buffer[56] = h[6]+t1; + + buffer[57] = message[ 8]; + + cudaMemcpyToSymbol(c_buffer, buffer, sizeof(c_buffer), 0, cudaMemcpyHostToDevice); +} +/* void skein512_cpu_setBlock_80(void *pdata) { uint64_t message[20]; @@ -564,7 +706,7 @@ void skein512_cpu_setBlock_80(void *pdata) CUDA_SAFE_CALL(cudaGetLastError()); } - +*/ __host__ void skein512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *g_hash) { diff --git a/quark/skein_header.h b/quark/skein_header.h new file mode 100644 index 0000000000..460b311f57 --- /dev/null +++ b/quark/skein_header.h @@ -0,0 +1,385 @@ +/* Elementary defines for SKEIN */ + +/* + * M9_ ## s ## _ ## i evaluates to s+i mod 9 (0 <= s <= 18, 0 <= i <= 7). + */ + +#define M9_0_0 0 +#define M9_0_1 1 +#define M9_0_2 2 +#define M9_0_3 3 +#define M9_0_4 4 +#define M9_0_5 5 +#define M9_0_6 6 +#define M9_0_7 7 + +#define M9_1_0 1 +#define M9_1_1 2 +#define M9_1_2 3 +#define M9_1_3 4 +#define M9_1_4 5 +#define M9_1_5 6 +#define M9_1_6 7 +#define M9_1_7 8 + +#define M9_2_0 2 +#define M9_2_1 3 +#define M9_2_2 4 +#define M9_2_3 5 +#define M9_2_4 6 +#define M9_2_5 7 +#define M9_2_6 8 +#define M9_2_7 0 + +#define M9_3_0 3 +#define M9_3_1 4 +#define M9_3_2 5 +#define M9_3_3 6 +#define M9_3_4 7 +#define M9_3_5 8 +#define M9_3_6 0 +#define M9_3_7 1 + +#define M9_4_0 4 +#define M9_4_1 5 +#define M9_4_2 6 +#define M9_4_3 7 +#define M9_4_4 8 +#define M9_4_5 0 +#define M9_4_6 1 +#define M9_4_7 2 + +#define M9_5_0 5 +#define M9_5_1 6 +#define M9_5_2 7 +#define M9_5_3 8 +#define M9_5_4 0 +#define M9_5_5 1 +#define M9_5_6 2 +#define M9_5_7 3 + +#define M9_6_0 6 +#define M9_6_1 7 +#define M9_6_2 8 +#define M9_6_3 0 +#define M9_6_4 1 +#define M9_6_5 2 +#define M9_6_6 3 +#define M9_6_7 4 + +#define M9_7_0 7 +#define M9_7_1 8 +#define M9_7_2 0 +#define M9_7_3 1 +#define M9_7_4 2 +#define M9_7_5 3 +#define M9_7_6 4 +#define M9_7_7 5 + +#define M9_8_0 8 +#define M9_8_1 0 +#define M9_8_2 1 +#define M9_8_3 2 +#define M9_8_4 3 +#define M9_8_5 4 +#define M9_8_6 5 +#define M9_8_7 6 + +#define M9_9_0 0 +#define M9_9_1 1 +#define M9_9_2 2 +#define M9_9_3 3 +#define M9_9_4 4 +#define M9_9_5 5 +#define M9_9_6 6 +#define M9_9_7 7 + +#define M9_10_0 1 +#define M9_10_1 2 +#define M9_10_2 3 +#define M9_10_3 4 +#define M9_10_4 5 +#define M9_10_5 6 +#define M9_10_6 7 +#define M9_10_7 8 + +#define M9_11_0 2 +#define M9_11_1 3 +#define M9_11_2 4 +#define M9_11_3 5 +#define M9_11_4 6 +#define M9_11_5 7 +#define M9_11_6 8 +#define M9_11_7 0 + +#define M9_12_0 3 +#define M9_12_1 4 +#define M9_12_2 5 +#define M9_12_3 6 +#define M9_12_4 7 +#define M9_12_5 8 +#define M9_12_6 0 +#define M9_12_7 1 + +#define M9_13_0 4 +#define M9_13_1 5 +#define M9_13_2 6 +#define M9_13_3 7 +#define M9_13_4 8 +#define M9_13_5 0 +#define M9_13_6 1 +#define M9_13_7 2 + +#define M9_14_0 5 +#define M9_14_1 6 +#define M9_14_2 7 +#define M9_14_3 8 +#define M9_14_4 0 +#define M9_14_5 1 +#define M9_14_6 2 +#define M9_14_7 3 + +#define M9_15_0 6 +#define M9_15_1 7 +#define M9_15_2 8 +#define M9_15_3 0 +#define M9_15_4 1 +#define M9_15_5 2 +#define M9_15_6 3 +#define M9_15_7 4 + +#define M9_16_0 7 +#define M9_16_1 8 +#define M9_16_2 0 +#define M9_16_3 1 +#define M9_16_4 2 +#define M9_16_5 3 +#define M9_16_6 4 +#define M9_16_7 5 + +#define M9_17_0 8 +#define M9_17_1 0 +#define M9_17_2 1 +#define M9_17_3 2 +#define M9_17_4 3 +#define M9_17_5 4 +#define M9_17_6 5 +#define M9_17_7 6 + +#define M9_18_0 0 +#define M9_18_1 1 +#define M9_18_2 2 +#define M9_18_3 3 +#define M9_18_4 4 +#define M9_18_5 5 +#define M9_18_6 6 +#define M9_18_7 7 + +/* + * M3_ ## s ## _ ## i evaluates to s+i mod 3 (0 <= s <= 18, 0 <= i <= 1). + */ + +#define M3_0_0 0 +#define M3_0_1 1 +#define M3_1_0 1 +#define M3_1_1 2 +#define M3_2_0 2 +#define M3_2_1 0 +#define M3_3_0 0 +#define M3_3_1 1 +#define M3_4_0 1 +#define M3_4_1 2 +#define M3_5_0 2 +#define M3_5_1 0 +#define M3_6_0 0 +#define M3_6_1 1 +#define M3_7_0 1 +#define M3_7_1 2 +#define M3_8_0 2 +#define M3_8_1 0 +#define M3_9_0 0 +#define M3_9_1 1 +#define M3_10_0 1 +#define M3_10_1 2 +#define M3_11_0 2 +#define M3_11_1 0 +#define M3_12_0 0 +#define M3_12_1 1 +#define M3_13_0 1 +#define M3_13_1 2 +#define M3_14_0 2 +#define M3_14_1 0 +#define M3_15_0 0 +#define M3_15_1 1 +#define M3_16_0 1 +#define M3_16_1 2 +#define M3_17_0 2 +#define M3_17_1 0 +#define M3_18_0 0 +#define M3_18_1 1 + +#define XCAT(x, y) XCAT_(x, y) +#define XCAT_(x, y) x ## y + +#define SKBI(k, s, i) XCAT(k, XCAT(XCAT(XCAT(M9_, s), _), i)) +#define SKBT(t, s, v) XCAT(t, XCAT(XCAT(XCAT(M3_, s), _), v)) + +#define TFBIG_ADDKEY(w0, w1, w2, w3, w4, w5, w6, w7, k, t, s) { \ + w0 = (w0 + SKBI(k, s, 0)); \ + w1 = (w1 + SKBI(k, s, 1)); \ + w2 = (w2 + SKBI(k, s, 2)); \ + w3 = (w3 + SKBI(k, s, 3)); \ + w4 = (w4 + SKBI(k, s, 4)); \ + w5 = (w5 + SKBI(k, s, 5) + SKBT(t, s, 0)); \ + w6 = (w6 + SKBI(k, s, 6) + SKBT(t, s, 1)); \ + w7 = (w7 + SKBI(k, s, 7) + make_uint2(s,0); \ + } + +#define TFBIG_MIX(x0, x1, rc) { \ + x0 = x0 + x1; \ + x1 = ROL2(x1, rc) ^ x0; \ + } + +#define TFBIG_MIX8(w0, w1, w2, w3, w4, w5, w6, w7, rc0, rc1, rc2, rc3) { \ + TFBIG_MIX(w0, w1, rc0); \ + TFBIG_MIX(w2, w3, rc1); \ + TFBIG_MIX(w4, w5, rc2); \ + TFBIG_MIX(w6, w7, rc3); \ + } + +#define TFBIG_4e(s) { \ + TFBIG_ADDKEY(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], h, t, s); \ + TFBIG_MIX8(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], 46, 36, 19, 37); \ + TFBIG_MIX8(p[2], p[1], p[4], p[7], p[6], p[5], p[0], p[3], 33, 27, 14, 42); \ + TFBIG_MIX8(p[4], p[1], p[6], p[3], p[0], p[5], p[2], p[7], 17, 49, 36, 39); \ + TFBIG_MIX8(p[6], p[1], p[0], p[7], p[2], p[5], p[4], p[3], 44, 9, 54, 56); \ + } + +#define TFBIG_4o(s) { \ + TFBIG_ADDKEY(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], h, t, s); \ + TFBIG_MIX8(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], 39, 30, 34, 24); \ + TFBIG_MIX8(p[2], p[1], p[4], p[7], p[6], p[5], p[0], p[3], 13, 50, 10, 17); \ + TFBIG_MIX8(p[4], p[1], p[6], p[3], p[0], p[5], p[2], p[7], 25, 29, 39, 43); \ + TFBIG_MIX8(p[6], p[1], p[0], p[7], p[2], p[5], p[4], p[3], 8, 35, 56, 22); \ + } + +#define TFBIG_KINIT_UI2(k0, k1, k2, k3, k4, k5, k6, k7, k8, t0, t1, t2) { \ + k8 = ((k0 ^ k1) ^ (k2 ^ k3)) ^ ((k4 ^ k5) ^ (k6 ^ k7)) \ + ^ vectorize(0x1BD11BDAA9FC1A22); \ + t2 = t0 ^ t1; \ + } + +#define TFBIG_ADDKEY_UI2(w0, w1, w2, w3, w4, w5, w6, w7, k, t, s) { \ + w0 = (w0 + SKBI(k, s, 0)); \ + w1 = (w1 + SKBI(k, s, 1)); \ + w2 = (w2 + SKBI(k, s, 2)); \ + w3 = (w3 + SKBI(k, s, 3)); \ + w4 = (w4 + SKBI(k, s, 4)); \ + w5 = (w5 + SKBI(k, s, 5) + SKBT(t, s, 0)); \ + w6 = (w6 + SKBI(k, s, 6) + SKBT(t, s, 1)); \ + w7 = (w7 + SKBI(k, s, 7) + vectorize(s)); \ + } + +#define TFBIG_ADDKEY_PRE(w0, w1, w2, w3, w4, w5, w6, w7, k, t, s) { \ + w0 = (w0 + SKBI(k, s, 0)); \ + w1 = (w1 + SKBI(k, s, 1)); \ + w2 = (w2 + SKBI(k, s, 2)); \ + w3 = (w3 + SKBI(k, s, 3)); \ + w4 = (w4 + SKBI(k, s, 4)); \ + w5 = (w5 + SKBI(k, s, 5) + SKBT(t, s, 0)); \ + w6 = (w6 + SKBI(k, s, 6) + SKBT(t, s, 1)); \ + w7 = (w7 + SKBI(k, s, 7) + (s)); \ + } + +#define TFBIG_MIX_UI2(x0, x1, rc) { \ + x0 = x0 + x1; \ + x1 = ROL2(x1, rc) ^ x0; \ + } + +#define TFBIG_MIX_PRE(x0, x1, rc) { \ + x0 = x0 + x1; \ + x1 = ROTL64(x1, rc) ^ x0; \ + } + +#define TFBIG_MIX8_UI2(w0, w1, w2, w3, w4, w5, w6, w7, rc0, rc1, rc2, rc3) { \ + TFBIG_MIX_UI2(w0, w1, rc0); \ + TFBIG_MIX_UI2(w2, w3, rc1); \ + TFBIG_MIX_UI2(w4, w5, rc2); \ + TFBIG_MIX_UI2(w6, w7, rc3); \ + } + +#define TFBIG_MIX8_PRE(w0, w1, w2, w3, w4, w5, w6, w7, rc0, rc1, rc2, rc3) { \ + TFBIG_MIX_PRE(w0, w1, rc0); \ + TFBIG_MIX_PRE(w2, w3, rc1); \ + TFBIG_MIX_PRE(w4, w5, rc2); \ + TFBIG_MIX_PRE(w6, w7, rc3); \ + } + +#define TFBIG_4e_UI2(s) { \ + TFBIG_ADDKEY_UI2(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], h, t, s); \ + TFBIG_MIX8_UI2(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], 46, 36, 19, 37); \ + TFBIG_MIX8_UI2(p[2], p[1], p[4], p[7], p[6], p[5], p[0], p[3], 33, 27, 14, 42); \ + TFBIG_MIX8_UI2(p[4], p[1], p[6], p[3], p[0], p[5], p[2], p[7], 17, 49, 36, 39); \ + TFBIG_MIX8_UI2(p[6], p[1], p[0], p[7], p[2], p[5], p[4], p[3], 44, 9, 54, 56); \ + } + +#define TFBIG_4e_PRE(s) { \ + TFBIG_ADDKEY_PRE(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], h, t, s); \ + TFBIG_MIX8_PRE(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], 46, 36, 19, 37); \ + TFBIG_MIX8_PRE(p[2], p[1], p[4], p[7], p[6], p[5], p[0], p[3], 33, 27, 14, 42); \ + TFBIG_MIX8_PRE(p[4], p[1], p[6], p[3], p[0], p[5], p[2], p[7], 17, 49, 36, 39); \ + TFBIG_MIX8_PRE(p[6], p[1], p[0], p[7], p[2], p[5], p[4], p[3], 44, 9, 54, 56); \ + } + +#define TFBIG_4o_UI2(s) { \ + TFBIG_ADDKEY_UI2(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], h, t, s); \ + TFBIG_MIX8_UI2(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], 39, 30, 34, 24); \ + TFBIG_MIX8_UI2(p[2], p[1], p[4], p[7], p[6], p[5], p[0], p[3], 13, 50, 10, 17); \ + TFBIG_MIX8_UI2(p[4], p[1], p[6], p[3], p[0], p[5], p[2], p[7], 25, 29, 39, 43); \ + TFBIG_MIX8_UI2(p[6], p[1], p[0], p[7], p[2], p[5], p[4], p[3], 8, 35, 56, 22); \ + } + +#define TFBIG_4o_PRE(s) { \ + TFBIG_ADDKEY_PRE(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], h, t, s); \ + TFBIG_MIX8_PRE(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], 39, 30, 34, 24); \ + TFBIG_MIX8_PRE(p[2], p[1], p[4], p[7], p[6], p[5], p[0], p[3], 13, 50, 10, 17); \ + TFBIG_MIX8_PRE(p[4], p[1], p[6], p[3], p[0], p[5], p[2], p[7], 25, 29, 39, 43); \ + TFBIG_MIX8_PRE(p[6], p[1], p[0], p[7], p[2], p[5], p[4], p[3], 8, 35, 56, 22); \ + } + +#define TFBIGMIX8e(){\ + p[ 0]+=p[ 1];p[ 2]+=p[ 3];p[ 4]+=p[ 5];p[ 6]+=p[ 7];p[ 1]=ROL2(p[ 1],46) ^ p[ 0];p[ 3]=ROL2(p[ 3],36) ^ p[ 2];p[ 5]=ROL2(p[ 5],19) ^ p[ 4];p[ 7]=ROL2(p[ 7],37) ^ p[ 6];\ + p[ 2]+=p[ 1];p[ 4]+=p[ 7];p[ 6]+=p[ 5];p[ 0]+=p[ 3];p[ 1]=ROL2(p[ 1],33) ^ p[ 2];p[ 7]=ROL2(p[ 7],27) ^ p[ 4];p[ 5]=ROL2(p[ 5],14) ^ p[ 6];p[ 3]=ROL2(p[ 3],42) ^ p[ 0];\ + p[ 4]+=p[ 1];p[ 6]+=p[ 3];p[ 0]+=p[ 5];p[ 2]+=p[ 7];p[ 1]=ROL2(p[ 1],17) ^ p[ 4];p[ 3]=ROL2(p[ 3],49) ^ p[ 6];p[ 5]=ROL2(p[ 5],36) ^ p[ 0];p[ 7]=ROL2(p[ 7],39) ^ p[ 2];\ + p[ 6]+=p[ 1];p[ 0]+=p[ 7];p[ 2]+=p[ 5];p[ 4]+=p[ 3];p[ 1]=ROL2(p[ 1],44) ^ p[ 6];p[ 7]=ROL2(p[ 7], 9) ^ p[ 0];p[ 5]=ROL2(p[ 5],54) ^ p[ 2];p[ 3]=ROR8(p[ 3]) ^ p[ 4];\ +} +#define TFBIGMIX8o(){\ + p[ 0]+=p[ 1];p[ 2]+=p[ 3];p[ 4]+=p[ 5];p[ 6]+=p[ 7];p[ 1]=ROL2(p[ 1],39) ^ p[ 0];p[ 3]=ROL2(p[ 3],30) ^ p[ 2];p[ 5]=ROL2(p[ 5],34) ^ p[ 4];p[ 7]=ROL24(p[ 7]) ^ p[ 6];\ + p[ 2]+=p[ 1];p[ 4]+=p[ 7];p[ 6]+=p[ 5];p[ 0]+=p[ 3];p[ 1]=ROL2(p[ 1],13) ^ p[ 2];p[ 7]=ROL2(p[ 7],50) ^ p[ 4];p[ 5]=ROL2(p[ 5],10) ^ p[ 6];p[ 3]=ROL2(p[ 3],17) ^ p[ 0];\ + p[ 4]+=p[ 1];p[ 6]+=p[ 3];p[ 0]+=p[ 5];p[ 2]+=p[ 7];p[ 1]=ROL2(p[ 1],25) ^ p[ 4];p[ 3]=ROL2(p[ 3],29) ^ p[ 6];p[ 5]=ROL2(p[ 5],39) ^ p[ 0];p[ 7]=ROL2(p[ 7],43) ^ p[ 2];\ + p[ 6]+=p[ 1];p[ 0]+=p[ 7];p[ 2]+=p[ 5];p[ 4]+=p[ 3];p[ 1]=ROL8(p[ 1]) ^ p[ 6];p[ 7]=ROL2(p[ 7],35) ^ p[ 0];p[ 5]=ROR8(p[ 5]) ^ p[ 2];p[ 3]=ROL2(p[ 3],22) ^ p[ 4];\ +} + +#define addwBuff(x0,x1,x2,x3,x4){\ + p[ 0]+=h[x0];\ + p[ 1]+=h[x1];\ + p[ 2]+=h[x2];\ + p[ 3]+=h[x3];\ + p[ 4]+=h[x4];\ + p[ 5]+=c_buffer[i++];\ + p[ 7]+=c_buffer[i++];\ + p[ 6]+=c_buffer[i];\ +} + +#define addwCon(x0,x1,x2,x3,x4,x5,x6,x7,y0,y1,y2){\ + p[ 0]+= h[x0];\ + p[ 1]+= h[x1];\ + p[ 2]+= h[x2];\ + p[ 3]+= h[x3];\ + p[ 4]+= h[x4];\ + p[ 5]+= h[x5] + c_t[y0];\ + p[ 6]+= h[x6] + c_t[y1];\ + p[ 7]+= h[x7] + c_add[y2];\ +} + + diff --git a/streebog/sib.cu b/streebog/sib.cu index 24de940b7f..31176d1523 100644 --- a/streebog/sib.cu +++ b/streebog/sib.cu @@ -1,15 +1,9 @@ extern "C" { -#include "sph/sph_blake.h" -#include "sph/sph_bmw.h" -#include "sph/sph_groestl.h" #include "sph/sph_skein.h" -#include "sph/sph_jh.h" -#include "sph/sph_keccak.h" #include "sph/sph_luffa.h" -#include "sph/sph_cubehash.h" -#include "sph/sph_shavite.h" -#include "sph/sph_simd.h" #include "sph/sph_echo.h" +#include "sph/sph_fugue.h" +#include "sph/sph_shabal.h" #include "sph/sph_streebog.h" } @@ -17,94 +11,76 @@ extern "C" { #include "cuda_helper.h" #include "x11/cuda_x11.h" -extern void streebog_cpu_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash); -extern void keccak_streebog_cpu_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash); -extern void keccak_streebog_luffa_cpu_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash); - #include #include #define NBN 2 + static uint32_t *d_hash[MAX_GPUS]; static uint32_t *d_resNonce[MAX_GPUS]; static uint32_t *h_resNonce[MAX_GPUS]; +extern void streebog_cpu_hash_64_final(int thr_id, uint32_t threads, uint32_t *d_hash,uint32_t* d_resNonce); +extern void streebog_set_target(const uint32_t* ptarget); + +extern void x14_shabal512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash); +//extern void x13_fugue512_cpu_hash_64(int thr_id, uint32_t threads,uint32_t *d_hash); +extern void x13_fugue512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash); +extern void x13_fugue512_cpu_init(int thr_id, uint32_t threads); + +extern void x11_echo512_cpu_hash_64(int thr_id, uint32_t threads,uint32_t *d_hash); +extern void x11_luffa512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash); + +extern void skein512_cpu_setBlock_80(void *pdata); +extern void skein512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash); + // Sibcoin CPU Hash -extern "C" void sibhash(void *output, const void *input) +extern "C" void polyhash(void *output, const void *input) { unsigned char _ALIGN(128) hash[128] = { 0 }; - sph_blake512_context ctx_blake; - sph_bmw512_context ctx_bmw; - sph_groestl512_context ctx_groestl; sph_skein512_context ctx_skein; - sph_jh512_context ctx_jh; - sph_keccak512_context ctx_keccak; - sph_gost512_context ctx_gost; - sph_luffa512_context ctx_luffa; - sph_cubehash512_context ctx_cubehash; - sph_shavite512_context ctx_shavite; - sph_simd512_context ctx_simd; + sph_shabal512_context ctx_shabal; sph_echo512_context ctx_echo; + sph_luffa512_context ctx_luffa; + sph_fugue512_context ctx_fugue; + sph_gost512_context ctx_gost; - sph_blake512_init(&ctx_blake); - sph_blake512 (&ctx_blake, input, 80); - sph_blake512_close(&ctx_blake, (void*) hash); - sph_bmw512_init(&ctx_bmw); - sph_bmw512 (&ctx_bmw, (const void*) hash, 64); - sph_bmw512_close(&ctx_bmw, (void*) hash); + sph_skein512_init(&ctx_skein); + sph_skein512 (&ctx_skein, input, 80); + sph_skein512_close(&ctx_skein, (void*) hash); - sph_groestl512_init(&ctx_groestl); - sph_groestl512 (&ctx_groestl, (const void*) hash, 64); - sph_groestl512_close(&ctx_groestl, (void*) hash); + sph_shabal512_init(&ctx_shabal); + sph_shabal512 (&ctx_shabal, (const void*) hash, 64); + sph_shabal512_close(&ctx_shabal, (void*) hash); - sph_skein512_init(&ctx_skein); - sph_skein512 (&ctx_skein, (const void*) hash, 64); - sph_skein512_close(&ctx_skein, (void*) hash); + sph_echo512_init(&ctx_echo); + sph_echo512 (&ctx_echo, (const void*) hash, 64); + sph_echo512_close(&ctx_echo, (void*) hash); - sph_jh512_init(&ctx_jh); - sph_jh512 (&ctx_jh, (const void*) hash, 64); - sph_jh512_close(&ctx_jh, (void*) hash); + sph_luffa512_init(&ctx_luffa); + sph_luffa512 (&ctx_luffa, (const void*) hash, 64); + sph_luffa512_close (&ctx_luffa, (void*) hash); - sph_keccak512_init(&ctx_keccak); - sph_keccak512 (&ctx_keccak, (const void*) hash, 64); - sph_keccak512_close(&ctx_keccak, (void*) hash); + sph_fugue512_init(&ctx_fugue); + sph_fugue512(&ctx_fugue, (const void*) hash, 64); + sph_fugue512_close(&ctx_fugue, (void*) hash); sph_gost512_init(&ctx_gost); sph_gost512(&ctx_gost, (const void*) hash, 64); sph_gost512_close(&ctx_gost, (void*) hash); - sph_luffa512_init(&ctx_luffa); - sph_luffa512 (&ctx_luffa, (const void*) hash, 64); - sph_luffa512_close (&ctx_luffa, (void*) hash); - - sph_cubehash512_init(&ctx_cubehash); - sph_cubehash512 (&ctx_cubehash, (const void*) hash, 64); - sph_cubehash512_close(&ctx_cubehash, (void*) hash); - - sph_shavite512_init(&ctx_shavite); - sph_shavite512 (&ctx_shavite, (const void*) hash, 64); - sph_shavite512_close(&ctx_shavite, (void*) hash); - - sph_simd512_init(&ctx_simd); - sph_simd512 (&ctx_simd, (const void*) hash, 64); - sph_simd512_close(&ctx_simd, (void*) hash); - - sph_echo512_init(&ctx_echo); - sph_echo512 (&ctx_echo, (const void*) hash, 64); - sph_echo512_close(&ctx_echo, (void*) hash); - memcpy(output, hash, 32); } //#define _DEBUG -#define _DEBUG_PREFIX "sib" +#define _DEBUG_PREFIX "poly" #include "cuda_debug.cuh" static bool init[MAX_GPUS] = { 0 }; -extern "C" int scanhash_sib(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done) +extern "C" int scanhash_poly(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done) { int dev_id = device_map[thr_id]; @@ -122,10 +98,8 @@ extern "C" int scanhash_sib(int thr_id, struct work* work, uint32_t max_nonce, u uint32_t throughput = cuda_default_throughput(thr_id, default_throughput); // 19=256*256*8; if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce); - throughput&=0xFFFFFF70; //multiples of 128 due to keccak_streebog_luffa - - if (opt_benchmark) - ptarget[7] = 0xf; + if (opt_benchmark) + ptarget[7] = 0xf; if (!init[thr_id]) { @@ -139,8 +113,6 @@ extern "C" int scanhash_sib(int thr_id, struct work* work, uint32_t max_nonce, u gpulog(LOG_INFO,thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput); - x11_simd_echo_512_cpu_init(thr_id, throughput); - CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], 8 * sizeof(uint64_t) * throughput)); CUDA_SAFE_CALL(cudaMalloc(&d_resNonce[thr_id], NBN * sizeof(uint32_t))); h_resNonce[thr_id] = (uint32_t*) malloc(NBN * sizeof(uint32_t)); @@ -148,34 +120,28 @@ extern "C" int scanhash_sib(int thr_id, struct work* work, uint32_t max_nonce, u gpulog(LOG_ERR,thr_id,"Host memory allocation failed"); exit(EXIT_FAILURE); } - + x13_fugue512_cpu_init(thr_id, throughput); init[thr_id] = true; } - + uint32_t endiandata[20]; for (int k=0; k < 20; k++) be32enc(&endiandata[k], pdata[k]); - quark_blake512_cpu_setBlock_80(thr_id, endiandata); - + skein512_cpu_setBlock_80(endiandata); cudaMemset(d_resNonce[thr_id], 0xff, NBN*sizeof(uint32_t)); + streebog_set_target(ptarget); do { // Hash with CUDA - quark_blake512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); - quark_bmw512_cpu_hash_64(thr_id, throughput, NULL, d_hash[thr_id]); - quark_groestl512_cpu_hash_64(thr_id, throughput, NULL, d_hash[thr_id]); - quark_skein512_cpu_hash_64(thr_id, throughput, NULL, d_hash[thr_id]); - quark_jh512_cpu_hash_64(thr_id, throughput, NULL, d_hash[thr_id]); - -// quark_keccak512_cpu_hash_64(thr_id, throughput, NULL, d_hash[thr_id]); -// streebog_cpu_hash_64(thr_id, throughput, d_hash[thr_id]); - keccak_streebog_cpu_hash_64(thr_id, throughput, d_hash[thr_id]); + + skein512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); + x14_shabal512_cpu_hash_64(thr_id, throughput, d_hash[thr_id]); + x11_echo512_cpu_hash_64(thr_id, throughput, d_hash[thr_id]); x11_luffa512_cpu_hash_64(thr_id, throughput, d_hash[thr_id]); - x11_cubehash_shavite512_cpu_hash_64(thr_id, throughput, d_hash[thr_id]); -// x11_luffaCubehashShavite512_cpu_hash_64(thr_id, throughput, d_hash[thr_id]); - x11_simd_echo512_cpu_hash_64_final(thr_id, throughput, d_hash[thr_id], d_resNonce[thr_id], *(uint64_t*)&ptarget[6]); + x13_fugue512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id]); + streebog_cpu_hash_64_final(thr_id, throughput, d_hash[thr_id], d_resNonce[thr_id]); cudaMemcpy(h_resNonce[thr_id], d_resNonce[thr_id], NBN*sizeof(uint32_t), cudaMemcpyDeviceToHost); @@ -184,9 +150,9 @@ extern "C" int scanhash_sib(int thr_id, struct work* work, uint32_t max_nonce, u const uint32_t startNounce = pdata[19]; uint32_t vhash64[8]; be32enc(&endiandata[19], startNounce + h_resNonce[thr_id][0]); - sibhash(vhash64, endiandata); + polyhash(vhash64, endiandata); - if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) { + if (vhash64[7] <= Htarg) { int res = 1; *hashes_done = pdata[19] - first_nonce + throughput; work_set_target_ratio(work, vhash64); @@ -196,7 +162,7 @@ extern "C" int scanhash_sib(int thr_id, struct work* work, uint32_t max_nonce, u // if(!opt_quiet) // gpulog(LOG_BLUE,dev_id,"Found 2nd nonce: %08x", pdata[21]); be32enc(&endiandata[19], startNounce+h_resNonce[thr_id][1]); - sibhash(vhash64, endiandata); + polyhash(vhash64, endiandata); if (bn_hash_target_ratio(vhash64, ptarget) > work->shareratio[0]){ work_set_target_ratio(work, vhash64); xchg(pdata[19],pdata[21]); @@ -221,7 +187,7 @@ extern "C" int scanhash_sib(int thr_id, struct work* work, uint32_t max_nonce, u } // cleanup -extern "C" void free_sib(int thr_id) +extern "C" void free_poly(int thr_id) { if (!init[thr_id]) return; @@ -231,8 +197,6 @@ extern "C" void free_sib(int thr_id) free(h_resNonce[thr_id]); cudaFree(d_resNonce[thr_id]); cudaFree(d_hash[thr_id]); - - x11_simd_echo_512_cpu_free(thr_id); init[thr_id] = false; diff --git a/streebog/veltor.cu b/streebog/veltor.cu index c288b4f3d7..ea1bd1c76d 100644 --- a/streebog/veltor.cu +++ b/streebog/veltor.cu @@ -95,14 +95,14 @@ extern "C" int scanhash_veltor(int thr_id, struct work* work, uint32_t max_nonce init[thr_id] = true; } - uint32_t endiandata[20]; - for (int k=0; k < 20; k++) - be32enc(&endiandata[k], pdata[k]); + uint32_t endiandata[20]; + for (int k=0; k < 20; k++) + be32enc(&endiandata[k], pdata[k]); - skein512_cpu_setBlock_80(endiandata); + skein512_cpu_setBlock_80(endiandata); - cudaMemset(d_resNonce[thr_id], 0xff, NBN*sizeof(uint32_t)); - streebog_set_target(ptarget); + cudaMemset(d_resNonce[thr_id], 0xff, NBN*sizeof(uint32_t)); + streebog_set_target(ptarget); do { skein512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); diff --git a/util.cpp b/util.cpp index 146e89d9a1..5ef1ff358a 100644 --- a/util.cpp +++ b/util.cpp @@ -2182,7 +2182,7 @@ void print_hash_tests(void){ c11hash(&hash[0], &buf[0]); printpfx("c11", hash); - sibhash(&hash[0], &buf[0]); + polyhash(&hash[0], &buf[0]); printpfx("sib", hash); x13hash(&hash[0], &buf[0]); diff --git a/x13/cuda_x13_fugue512.cu b/x13/cuda_x13_fugue512.cu index b3d708a226..43bb744a15 100644 --- a/x13/cuda_x13_fugue512.cu +++ b/x13/cuda_x13_fugue512.cu @@ -1,32 +1,26 @@ + +#include + +#define TPB 256 + /* - * Quick and dirty addition of Fugue-512 for X13 - * - * Built on cbuchner1's implementation, actual hashing code - * heavily based on phm's sgminer - * - * - */ -#include "cuda_helper.h" -#include "miner.h" -#include "cuda_vectors.h" -/* - * X13 kernel implementation. + * fugue512 x13 kernel implementation. * * ==========================(LICENSE BEGIN)============================ * - * Copyright (c) 2014-2016 phm, Provos Alexis - * + * Copyright (c) 2014-2017 phm, tpruvot + * * Permission is hereby granted, free of charge, to any person obtaining * a copy of this software and associated documentation files (the - * "Software", to deal in the Software without restriction, including + * "Software"), to deal in the Software without restriction, including * without limitation the rights to use, copy, modify, merge, publish, * distribute, sublicense, and/or sell copies of the Software, and to * permit persons to whom the Software is furnished to do so, subject to * the following conditions: - * + * * The above copyright notice and this permission notice shall be * included in all copies or substantial portions of the Software. - * + * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. @@ -36,373 +30,356 @@ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. * * ===========================(LICENSE END)============================= - * - * @author phm - * @author Provos Alexis (Applied partial shared Mem utilization under CUDA 7.5 for compute5.0/5.2 / 2016) */ -static __constant__ const uint32_t c_S[16] = { - 0x8807a57e, 0xe616af75, 0xc5d3e4db, 0xac9ab027, - 0xd915f117, 0xb6eecc54, 0x06e8020b, 0x4a92efd1, - 0xaac6e2c9, 0xddb21398, 0xcae65838, 0x437f203f, - 0x25ea78e7, 0x951fddd6, 0xda6ed11d, 0xe13e3567 +#ifdef __INTELLISENSE__ +#define __byte_perm(x, y, m) (x|y) +#define tex1Dfetch(t, n) (n) +#define __CUDACC__ +#include +#endif + +// store allocated textures device addresses +static unsigned int* d_textures[MAX_GPUS][1]; + +#define mixtab0(x) mixtabs[(x)] +#define mixtab1(x) mixtabs[(x)+256] +#define mixtab2(x) mixtabs[(x)+512] +#define mixtab3(x) mixtabs[(x)+768] + +static texture mixTab0Tex; + +static const uint32_t mixtab0[] = { + 0x63633297, 0x7c7c6feb, 0x77775ec7, 0x7b7b7af7, 0xf2f2e8e5, 0x6b6b0ab7, 0x6f6f16a7, 0xc5c56d39, + 0x303090c0, 0x01010704, 0x67672e87, 0x2b2bd1ac, 0xfefeccd5, 0xd7d71371, 0xabab7c9a, 0x767659c3, + 0xcaca4005, 0x8282a33e, 0xc9c94909, 0x7d7d68ef, 0xfafad0c5, 0x5959947f, 0x4747ce07, 0xf0f0e6ed, + 0xadad6e82, 0xd4d41a7d, 0xa2a243be, 0xafaf608a, 0x9c9cf946, 0xa4a451a6, 0x727245d3, 0xc0c0762d, + 0xb7b728ea, 0xfdfdc5d9, 0x9393d47a, 0x2626f298, 0x363682d8, 0x3f3fbdfc, 0xf7f7f3f1, 0xcccc521d, + 0x34348cd0, 0xa5a556a2, 0xe5e58db9, 0xf1f1e1e9, 0x71714cdf, 0xd8d83e4d, 0x313197c4, 0x15156b54, + 0x04041c10, 0xc7c76331, 0x2323e98c, 0xc3c37f21, 0x18184860, 0x9696cf6e, 0x05051b14, 0x9a9aeb5e, + 0x0707151c, 0x12127e48, 0x8080ad36, 0xe2e298a5, 0xebeba781, 0x2727f59c, 0xb2b233fe, 0x757550cf, + 0x09093f24, 0x8383a43a, 0x2c2cc4b0, 0x1a1a4668, 0x1b1b416c, 0x6e6e11a3, 0x5a5a9d73, 0xa0a04db6, + 0x5252a553, 0x3b3ba1ec, 0xd6d61475, 0xb3b334fa, 0x2929dfa4, 0xe3e39fa1, 0x2f2fcdbc, 0x8484b126, + 0x5353a257, 0xd1d10169, 0x00000000, 0xededb599, 0x2020e080, 0xfcfcc2dd, 0xb1b13af2, 0x5b5b9a77, + 0x6a6a0db3, 0xcbcb4701, 0xbebe17ce, 0x3939afe4, 0x4a4aed33, 0x4c4cff2b, 0x5858937b, 0xcfcf5b11, + 0xd0d0066d, 0xefefbb91, 0xaaaa7b9e, 0xfbfbd7c1, 0x4343d217, 0x4d4df82f, 0x333399cc, 0x8585b622, + 0x4545c00f, 0xf9f9d9c9, 0x02020e08, 0x7f7f66e7, 0x5050ab5b, 0x3c3cb4f0, 0x9f9ff04a, 0xa8a87596, + 0x5151ac5f, 0xa3a344ba, 0x4040db1b, 0x8f8f800a, 0x9292d37e, 0x9d9dfe42, 0x3838a8e0, 0xf5f5fdf9, + 0xbcbc19c6, 0xb6b62fee, 0xdada3045, 0x2121e784, 0x10107040, 0xffffcbd1, 0xf3f3efe1, 0xd2d20865, + 0xcdcd5519, 0x0c0c2430, 0x1313794c, 0xececb29d, 0x5f5f8667, 0x9797c86a, 0x4444c70b, 0x1717655c, + 0xc4c46a3d, 0xa7a758aa, 0x7e7e61e3, 0x3d3db3f4, 0x6464278b, 0x5d5d886f, 0x19194f64, 0x737342d7, + 0x60603b9b, 0x8181aa32, 0x4f4ff627, 0xdcdc225d, 0x2222ee88, 0x2a2ad6a8, 0x9090dd76, 0x88889516, + 0x4646c903, 0xeeeebc95, 0xb8b805d6, 0x14146c50, 0xdede2c55, 0x5e5e8163, 0x0b0b312c, 0xdbdb3741, + 0xe0e096ad, 0x32329ec8, 0x3a3aa6e8, 0x0a0a3628, 0x4949e43f, 0x06061218, 0x2424fc90, 0x5c5c8f6b, + 0xc2c27825, 0xd3d30f61, 0xacac6986, 0x62623593, 0x9191da72, 0x9595c662, 0xe4e48abd, 0x797974ff, + 0xe7e783b1, 0xc8c84e0d, 0x373785dc, 0x6d6d18af, 0x8d8d8e02, 0xd5d51d79, 0x4e4ef123, 0xa9a97292, + 0x6c6c1fab, 0x5656b943, 0xf4f4fafd, 0xeaeaa085, 0x6565208f, 0x7a7a7df3, 0xaeae678e, 0x08083820, + 0xbaba0bde, 0x787873fb, 0x2525fb94, 0x2e2ecab8, 0x1c1c5470, 0xa6a65fae, 0xb4b421e6, 0xc6c66435, + 0xe8e8ae8d, 0xdddd2559, 0x747457cb, 0x1f1f5d7c, 0x4b4bea37, 0xbdbd1ec2, 0x8b8b9c1a, 0x8a8a9b1e, + 0x70704bdb, 0x3e3ebaf8, 0xb5b526e2, 0x66662983, 0x4848e33b, 0x0303090c, 0xf6f6f4f5, 0x0e0e2a38, + 0x61613c9f, 0x35358bd4, 0x5757be47, 0xb9b902d2, 0x8686bf2e, 0xc1c17129, 0x1d1d5374, 0x9e9ef74e, + 0xe1e191a9, 0xf8f8decd, 0x9898e556, 0x11117744, 0x696904bf, 0xd9d93949, 0x8e8e870e, 0x9494c166, + 0x9b9bec5a, 0x1e1e5a78, 0x8787b82a, 0xe9e9a989, 0xcece5c15, 0x5555b04f, 0x2828d8a0, 0xdfdf2b51, + 0x8c8c8906, 0xa1a14ab2, 0x89899212, 0x0d0d2334, 0xbfbf10ca, 0xe6e684b5, 0x4242d513, 0x686803bb, + 0x4141dc1f, 0x9999e252, 0x2d2dc3b4, 0x0f0f2d3c, 0xb0b03df6, 0x5454b74b, 0xbbbb0cda, 0x16166258 }; -static __device__ uint32_t mixtab0[256] = { - 0x63633297, 0x7c7c6feb, 0x77775ec7, 0x7b7b7af7, 0xf2f2e8e5, 0x6b6b0ab7, 0x6f6f16a7, 0xc5c56d39, 0x303090c0, 0x01010704, 0x67672e87, 0x2b2bd1ac, 0xfefeccd5, 0xd7d71371, 0xabab7c9a, - 0x767659c3, 0xcaca4005, 0x8282a33e, 0xc9c94909, 0x7d7d68ef, 0xfafad0c5, 0x5959947f, 0x4747ce07, 0xf0f0e6ed, 0xadad6e82, 0xd4d41a7d, 0xa2a243be, 0xafaf608a, 0x9c9cf946, 0xa4a451a6, - 0x727245d3, 0xc0c0762d, 0xb7b728ea, 0xfdfdc5d9, 0x9393d47a, 0x2626f298, 0x363682d8, 0x3f3fbdfc, 0xf7f7f3f1, 0xcccc521d, 0x34348cd0, 0xa5a556a2, 0xe5e58db9, 0xf1f1e1e9, 0x71714cdf, - 0xd8d83e4d, 0x313197c4, 0x15156b54, 0x04041c10, 0xc7c76331, 0x2323e98c, 0xc3c37f21, 0x18184860, 0x9696cf6e, 0x05051b14, 0x9a9aeb5e, 0x0707151c, 0x12127e48, 0x8080ad36, 0xe2e298a5, - 0xebeba781, 0x2727f59c, 0xb2b233fe, 0x757550cf, 0x09093f24, 0x8383a43a, 0x2c2cc4b0, 0x1a1a4668, 0x1b1b416c, 0x6e6e11a3, 0x5a5a9d73, 0xa0a04db6, 0x5252a553, 0x3b3ba1ec, 0xd6d61475, - 0xb3b334fa, 0x2929dfa4, 0xe3e39fa1, 0x2f2fcdbc, 0x8484b126, 0x5353a257, 0xd1d10169, 0x00000000, 0xededb599, 0x2020e080, 0xfcfcc2dd, 0xb1b13af2, 0x5b5b9a77, 0x6a6a0db3, 0xcbcb4701, - 0xbebe17ce, 0x3939afe4, 0x4a4aed33, 0x4c4cff2b, 0x5858937b, 0xcfcf5b11, 0xd0d0066d, 0xefefbb91, 0xaaaa7b9e, 0xfbfbd7c1, 0x4343d217, 0x4d4df82f, 0x333399cc, 0x8585b622, 0x4545c00f, - 0xf9f9d9c9, 0x02020e08, 0x7f7f66e7, 0x5050ab5b, 0x3c3cb4f0, 0x9f9ff04a, 0xa8a87596, 0x5151ac5f, 0xa3a344ba, 0x4040db1b, 0x8f8f800a, 0x9292d37e, 0x9d9dfe42, 0x3838a8e0, 0xf5f5fdf9, - 0xbcbc19c6, 0xb6b62fee, 0xdada3045, 0x2121e784, 0x10107040, 0xffffcbd1, 0xf3f3efe1, 0xd2d20865, 0xcdcd5519, 0x0c0c2430, 0x1313794c, 0xececb29d, 0x5f5f8667, 0x9797c86a, 0x4444c70b, - 0x1717655c, 0xc4c46a3d, 0xa7a758aa, 0x7e7e61e3, 0x3d3db3f4, 0x6464278b, 0x5d5d886f, 0x19194f64, 0x737342d7, 0x60603b9b, 0x8181aa32, 0x4f4ff627, 0xdcdc225d, 0x2222ee88, 0x2a2ad6a8, - 0x9090dd76, 0x88889516, 0x4646c903, 0xeeeebc95, 0xb8b805d6, 0x14146c50, 0xdede2c55, 0x5e5e8163, 0x0b0b312c, 0xdbdb3741, 0xe0e096ad, 0x32329ec8, 0x3a3aa6e8, 0x0a0a3628, 0x4949e43f, - 0x06061218, 0x2424fc90, 0x5c5c8f6b, 0xc2c27825, 0xd3d30f61, 0xacac6986, 0x62623593, 0x9191da72, 0x9595c662, 0xe4e48abd, 0x797974ff, 0xe7e783b1, 0xc8c84e0d, 0x373785dc, 0x6d6d18af, - 0x8d8d8e02, 0xd5d51d79, 0x4e4ef123, 0xa9a97292, 0x6c6c1fab, 0x5656b943, 0xf4f4fafd, 0xeaeaa085, 0x6565208f, 0x7a7a7df3, 0xaeae678e, 0x08083820, 0xbaba0bde, 0x787873fb, 0x2525fb94, - 0x2e2ecab8, 0x1c1c5470, 0xa6a65fae, 0xb4b421e6, 0xc6c66435, 0xe8e8ae8d, 0xdddd2559, 0x747457cb, 0x1f1f5d7c, 0x4b4bea37, 0xbdbd1ec2, 0x8b8b9c1a, 0x8a8a9b1e, 0x70704bdb, 0x3e3ebaf8, - 0xb5b526e2, 0x66662983, 0x4848e33b, 0x0303090c, 0xf6f6f4f5, 0x0e0e2a38, 0x61613c9f, 0x35358bd4, 0x5757be47, 0xb9b902d2, 0x8686bf2e, 0xc1c17129, 0x1d1d5374, 0x9e9ef74e, 0xe1e191a9, - 0xf8f8decd, 0x9898e556, 0x11117744, 0x696904bf, 0xd9d93949, 0x8e8e870e, 0x9494c166, 0x9b9bec5a, 0x1e1e5a78, 0x8787b82a, 0xe9e9a989, 0xcece5c15, 0x5555b04f, 0x2828d8a0, 0xdfdf2b51, - 0x8c8c8906, 0xa1a14ab2, 0x89899212, 0x0d0d2334, 0xbfbf10ca, 0xe6e684b5, 0x4242d513, 0x686803bb, 0x4141dc1f, 0x9999e252, 0x2d2dc3b4, 0x0f0f2d3c, 0xb0b03df6, 0x5454b74b, 0xbbbb0cda, - 0x16166258 -}; - -#define mixtab0(x) shared[0][x] -#define mixtab1(x) shared[1][x] -#define mixtab2(x) shared[2][x] -#define mixtab3(x) shared[3][x] - #define TIX4(q, x00, x01, x04, x07, x08, x22, x24, x27, x30) { \ - x22 ^= x00; \ - x00 = (q); \ - x08 ^= (q); \ - x01 ^= x24; \ - x04 ^= x27; \ - x07 ^= x30; \ - } + x22 ^= x00; \ + x00 = (q); \ + x08 ^= x00; \ + x01 ^= x24; \ + x04 ^= x27; \ + x07 ^= x30; \ +} #define CMIX36(x00, x01, x02, x04, x05, x06, x18, x19, x20) { \ - x00 ^= x04; \ - x01 ^= x05; \ - x02 ^= x06; \ - x18 ^= x04; \ - x19 ^= x05; \ - x20 ^= x06; \ - } + x00 ^= x04; \ + x01 ^= x05; \ + x02 ^= x06; \ + x18 ^= x04; \ + x19 ^= x05; \ + x20 ^= x06; \ +} -__device__ __forceinline__ -static void SMIX(const uint32_t shared[4][256], uint32_t &x0,uint32_t &x1,uint32_t &x2,uint32_t &x3){ - uint32_t c0 = mixtab0(__byte_perm(x0,0,0x4443)); - uint32_t r1 = mixtab1(__byte_perm(x0,0,0x4442)); - uint32_t r2 = mixtab2(__byte_perm(x0,0,0x4441)); - uint32_t r3 = mixtab3(__byte_perm(x0,0,0x4440)); - c0 = c0 ^ r1 ^ r2 ^ r3; - uint32_t r0 = mixtab0(__byte_perm(x1,0,0x4443)); - uint32_t c1 = r0 ^ mixtab1(__byte_perm(x1,0,0x4442)); - uint32_t tmp = mixtab2(__byte_perm(x1,0,0x4441)); - c1 ^= tmp; - r2 ^= tmp; - tmp = mixtab3(__byte_perm(x1,0,0x4440)); - c1 ^= tmp; - r3 ^= tmp; - uint32_t c2 = mixtab0(__byte_perm(x2,0,0x4443)); - r0 ^= c2; - tmp = mixtab1(__byte_perm(x2,0,0x4442)); - c2 ^= tmp; - r1 ^= tmp; - tmp = mixtab2(__byte_perm(x2,0,0x4441)); - c2 ^= tmp; - tmp = mixtab3(__byte_perm(x2,0,0x4440)); - c2 ^= tmp; - r3 ^= tmp; - uint32_t c3 = mixtab0(__byte_perm(x3,0,0x4443)); - r0 ^= c3; - tmp = mixtab1(__byte_perm(x3,0,0x4442)); - c3 ^= tmp; - r1 ^= tmp; - tmp = mixtab2(__byte_perm(x3,0,0x4441)); - c3 ^= tmp; - r2 ^= tmp; - tmp = mixtab3(__byte_perm(x3,0,0x4440)); - c3 ^= tmp; - x0 = ((c0 ^ (r0 << 0)) & 0xFF000000) | ((c1 ^ (r1 << 0)) & 0x00FF0000) | ((c2 ^ (r2 << 0)) & 0x0000FF00) | ((c3 ^ (r3 << 0)) & 0x000000FF); - x1 = ((c1 ^ (r0 << 8)) & 0xFF000000) | ((c2 ^ (r1 << 8)) & 0x00FF0000) | ((c3 ^ (r2 << 8)) & 0x0000FF00) | ((c0 ^ (r3 >>24)) & 0x000000FF); - x2 = ((c2 ^ (r0 <<16)) & 0xFF000000) | ((c3 ^ (r1 <<16)) & 0x00FF0000) | ((c0 ^ (r2 >>16)) & 0x0000FF00) | ((c1 ^ (r3 >>16)) & 0x000000FF); - x3 = ((c3 ^ (r0 <<24)) & 0xFF000000) | ((c0 ^ (r1 >> 8)) & 0x00FF0000) | ((c1 ^ (r2 >> 8)) & 0x0000FF00) | ((c2 ^ (r3 >> 8)) & 0x000000FF); +#define SMIX(x0, x1, x2, x3) { \ + uint32_t tmp; \ + uint32_t r0 = 0; \ + uint32_t r1 = 0; \ + uint32_t r2 = 0; \ + uint32_t r3 = 0; \ + uint32_t c0 = mixtab0(x0 >> 24); \ + tmp = mixtab1((x0 >> 16) & 0xFF); \ + c0 ^= tmp; \ + r1 ^= tmp; \ + tmp = mixtab2((x0 >> 8) & 0xFF); \ + c0 ^= tmp; \ + r2 ^= tmp; \ + tmp = mixtab3(x0 & 0xFF); \ + c0 ^= tmp; \ + r3 ^= tmp; \ + tmp = mixtab0(x1 >> 24); \ + uint32_t c1 = tmp; \ + r0 ^= tmp; \ + tmp = mixtab1((x1 >> 16) & 0xFF); \ + c1 ^= tmp; \ + tmp = mixtab2((x1 >> 8) & 0xFF); \ + c1 ^= tmp; \ + r2 ^= tmp; \ + tmp = mixtab3(x1 & 0xFF); \ + c1 ^= tmp; \ + r3 ^= tmp; \ + tmp = mixtab0(x2 >> 24); \ + uint32_t c2 = tmp; \ + r0 ^= tmp; \ + tmp = mixtab1((x2 >> 16) & 0xFF); \ + c2 ^= tmp; \ + r1 ^= tmp; \ + tmp = mixtab2((x2 >> 8) & 0xFF); \ + c2 ^= tmp; \ + tmp = mixtab3(x2 & 0xFF); \ + c2 ^= tmp; \ + r3 ^= tmp; \ + tmp = mixtab0(x3 >> 24); \ + uint32_t c3 = tmp; \ + r0 ^= tmp; \ + tmp = mixtab1((x3 >> 16) & 0xFF); \ + c3 ^= tmp; \ + r1 ^= tmp; \ + tmp = mixtab2((x3 >> 8) & 0xFF); \ + c3 ^= tmp; \ + r2 ^= tmp; \ + tmp = mixtab3(x3 & 0xFF); \ + c3 ^= tmp; \ + x0 = ((c0 ^ r0) & 0xFF000000) | ((c1 ^ r1) & 0x00FF0000) \ + | ((c2 ^ r2) & 0x0000FF00) | ((c3 ^ r3) & 0x000000FF); \ + x1 = ((c1 ^ (r0 << 8)) & 0xFF000000) | ((c2 ^ (r1 << 8)) & 0x00FF0000) \ + | ((c3 ^ (r2 << 8)) & 0x0000FF00) | ((c0 ^ (r3 >> 24)) & 0x000000FF); \ + x2 = ((c2 ^ (r0 << 16)) & 0xFF000000) | ((c3 ^ (r1 << 16)) & 0x00FF0000) \ + | ((c0 ^ (r2 >> 16)) & 0x0000FF00) | ((c1 ^ (r3 >> 16)) & 0x000000FF); \ + x3 = ((c3 ^ (r0 << 24)) & 0xFF000000) | ((c0 ^ (r1 >> 8)) & 0x00FF0000) \ + | ((c1 ^ (r2 >> 8)) & 0x0000FF00) | ((c2 ^ (r3 >> 8)) & 0x000000FF); \ } -__device__ -static void SMIX_LDG(const uint32_t shared[4][256], uint32_t &x0,uint32_t &x1,uint32_t &x2,uint32_t &x3){ - uint32_t c0 = __ldg(&mixtab0[__byte_perm(x0,0,0x4443)]); - uint32_t r1 = mixtab1(__byte_perm(x0,0,0x4442)); - uint32_t r2 = mixtab2(__byte_perm(x0,0,0x4441)); - uint32_t r3 = mixtab3(__byte_perm(x0,0,0x4440)); - c0 = c0 ^ r1 ^ r2 ^ r3; - uint32_t r0 = __ldg(&mixtab0[__byte_perm(x1,0,0x4443)]); - uint32_t c1 = r0 ^ mixtab1(__byte_perm(x1,0,0x4442)); - uint32_t tmp = mixtab2(__byte_perm(x1,0,0x4441)); - c1 ^= tmp; - r2 ^= tmp; - tmp = mixtab3(__byte_perm(x1,0,0x4440)); - c1 ^= tmp; - r3 ^= tmp; - uint32_t c2 = __ldg(&mixtab0[__byte_perm(x2,0,0x4443)]); - r0 ^= c2; - tmp = mixtab1(__byte_perm(x2,0,0x4442)); - c2 ^= tmp; - r1 ^= tmp; - tmp = mixtab2(__byte_perm(x2,0,0x4441)); - c2 ^= tmp; - tmp = mixtab3(__byte_perm(x2,0,0x4440)); - c2 ^= tmp; - r3 ^= tmp; - uint32_t c3 = __ldg(&mixtab0[__byte_perm(x3,0,0x4443)]); - r0 ^= c3; - tmp = mixtab1(__byte_perm(x3,0,0x4442)); - c3 ^= tmp; - r1 ^= tmp; - tmp = mixtab2(__byte_perm(x3,0,0x4441)); - c3 ^= tmp; - r2 ^= tmp; - tmp = ROL8(__ldg(&mixtab0[__byte_perm(x3,0,0x4440)])); - c3 ^= tmp; - x0 = ((c0 ^ (r0 << 0)) & 0xFF000000) | ((c1 ^ (r1 << 0)) & 0x00FF0000) | ((c2 ^ (r2 << 0)) & 0x0000FF00) | ((c3 ^ (r3 << 0)) & 0x000000FF); - x1 = ((c1 ^ (r0 << 8)) & 0xFF000000) | ((c2 ^ (r1 << 8)) & 0x00FF0000) | ((c3 ^ (r2 << 8)) & 0x0000FF00) | ((c0 ^ (r3 >>24)) & 0x000000FF); - x2 = ((c2 ^ (r0 <<16)) & 0xFF000000) | ((c3 ^ (r1 <<16)) & 0x00FF0000) | ((c0 ^ (r2 >>16)) & 0x0000FF00) | ((c1 ^ (r3 >>16)) & 0x000000FF); - x3 = ((c3 ^ (r0 <<24)) & 0xFF000000) | ((c0 ^ (r1 >> 8)) & 0x00FF0000) | ((c1 ^ (r2 >> 8)) & 0x0000FF00) | ((c2 ^ (r3 >> 8)) & 0x000000FF); +#define SUB_ROR3 { \ + B33 = S33, B34 = S34, B35 = S35; \ + S35 = S32; S34 = S31; S33 = S30; S32 = S29; S31 = S28; S30 = S27; S29 = S26; S28 = S25; S27 = S24; \ + S26 = S23; S25 = S22; S24 = S21; S23 = S20; S22 = S19; S21 = S18; S20 = S17; S19 = S16; S18 = S15; \ + S17 = S14; S16 = S13; S15 = S12; S14 = S11; S13 = S10; S12 = S09; S11 = S08; S10 = S07; S09 = S06; \ + S08 = S05; S07 = S04; S06 = S03; S05 = S02; S04 = S01; S03 = S00; S02 = B35; S01 = B34; S00 = B33; \ } -#define mROR3 { \ - B[ 6] = S[33], B[ 7] = S[34], B[ 8] = S[35]; \ - S[35] = S[32]; S[34] = S[31]; S[33] = S[30]; S[32] = S[29]; S[31] = S[28]; S[30] = S[27]; S[29] = S[26]; S[28] = S[25]; S[27] = S[24]; \ - S[26] = S[23]; S[25] = S[22]; S[24] = S[21]; S[23] = S[20]; S[22] = S[19]; S[21] = S[18]; S[20] = S[17]; S[19] = S[16]; S[18] = S[15]; \ - S[17] = S[14]; S[16] = S[13]; S[15] = S[12]; S[14] = S[11]; S[13] = S[10]; S[12] = S[ 9]; S[11] = S[ 8]; S[10] = S[ 7]; S[ 9] = S[ 6]; \ - S[ 8] = S[ 5]; S[ 7] = S[ 4]; S[ 6] = S[ 3]; S[ 5] = S[ 2]; S[ 4] = S[ 1]; S[ 3] = S[ 0]; S[ 2] = B[ 8]; S[ 1] = B[ 7]; S[ 0] = B[ 6]; \ - } -#define mROR8 { \ - B[ 1] = S[28], B[ 2] = S[29], B[ 3] = S[30], B[ 4] = S[31], B[ 5] = S[32], B[ 6] = S[33], B[ 7] = S[34], B[ 8] = S[35]; \ - S[35] = S[27]; S[34] = S[26]; S[33] = S[25]; S[32] = S[24]; S[31] = S[23]; S[30] = S[22]; S[29] = S[21]; S[28] = S[20]; S[27] = S[19]; \ - S[26] = S[18]; S[25] = S[17]; S[24] = S[16]; S[23] = S[15]; S[22] = S[14]; S[21] = S[13]; S[20] = S[12]; S[19] = S[11]; S[18] = S[10]; \ - S[17] = S[ 9]; S[16] = S[ 8]; S[15] = S[ 7]; S[14] = S[ 6]; S[13] = S[ 5]; S[12] = S[ 4]; S[11] = S[ 3]; S[10] = S[ 2]; S[ 9] = S[ 1]; \ - S[ 8] = S[ 0]; S[ 7] = B[ 8]; S[ 6] = B[ 7]; S[ 5] = B[ 6]; S[ 4] = B[ 5]; S[ 3] = B[ 4]; S[ 2] = B[ 3]; S[ 1] = B[ 2]; S[ 0] = B[ 1]; \ - } +#define SUB_ROR8 { \ + B28 = S28, B29 = S29, B30 = S30, B31 = S31, B32 = S32, B33 = S33, B34 = S34, B35 = S35; \ + S35 = S27; S34 = S26; S33 = S25; S32 = S24; S31 = S23; S30 = S22; S29 = S21; S28 = S20; S27 = S19; \ + S26 = S18; S25 = S17; S24 = S16; S23 = S15; S22 = S14; S21 = S13; S20 = S12; S19 = S11; S18 = S10; \ + S17 = S09; S16 = S08; S15 = S07; S14 = S06; S13 = S05; S12 = S04; S11 = S03; S10 = S02; S09 = S01; \ + S08 = S00; S07 = B35; S06 = B34; S05 = B33; S04 = B32; S03 = B31; S02 = B30; S01 = B29; S00 = B28; \ +} -#define mROR9 { \ - B[ 0] = S[27], B[ 1] = S[28], B[ 2] = S[29], B[ 3] = S[30], B[ 4] = S[31], B[ 5] = S[32], B[ 6] = S[33], B[ 7] = S[34], B[ 8] = S[35]; \ - S[35] = S[26]; S[34] = S[25]; S[33] = S[24]; S[32] = S[23]; S[31] = S[22]; S[30] = S[21]; S[29] = S[20]; S[28] = S[19]; S[27] = S[18]; \ - S[26] = S[17]; S[25] = S[16]; S[24] = S[15]; S[23] = S[14]; S[22] = S[13]; S[21] = S[12]; S[20] = S[11]; S[19] = S[10]; S[18] = S[ 9]; \ - S[17] = S[ 8]; S[16] = S[ 7]; S[15] = S[ 6]; S[14] = S[ 5]; S[13] = S[ 4]; S[12] = S[ 3]; S[11] = S[ 2]; S[10] = S[ 1]; S[ 9] = S[ 0]; \ - S[ 8] = B[ 8]; S[ 7] = B[ 7]; S[ 6] = B[ 6]; S[ 5] = B[ 5]; S[ 4] = B[ 4]; S[ 3] = B[ 3]; S[ 2] = B[ 2]; S[ 1] = B[ 1]; S[ 0] = B[ 0]; \ - } +#define SUB_ROR9 { \ + B27 = S27, B28 = S28, B29 = S29, B30 = S30, B31 = S31, B32 = S32, B33 = S33, B34 = S34, B35 = S35; \ + S35 = S26; S34 = S25; S33 = S24; S32 = S23; S31 = S22; S30 = S21; S29 = S20; S28 = S19; S27 = S18; \ + S26 = S17; S25 = S16; S24 = S15; S23 = S14; S22 = S13; S21 = S12; S20 = S11; S19 = S10; S18 = S09; \ + S17 = S08; S16 = S07; S15 = S06; S14 = S05; S13 = S04; S12 = S03; S11 = S02; S10 = S01; S09 = S00; \ + S08 = B35; S07 = B34; S06 = B33; S05 = B32; S04 = B31; S03 = B30; S02 = B29; S01 = B28; S00 = B27; \ +} -#define FUGUE512_3(x, y, z) { \ - TIX4(x, S[ 0], S[ 1], S[ 4], S[ 7], S[ 8], S[22], S[24], S[27], S[30]); \ - CMIX36(S[33], S[34], S[35], S[ 1], S[ 2], S[ 3], S[15], S[16], S[17]); \ - SMIX_LDG(shared, S[33], S[34], S[35], S[ 0]); \ - CMIX36(S[30], S[31], S[32], S[34], S[35], S[ 0], S[12], S[13], S[14]); \ - SMIX_LDG(shared, S[30], S[31], S[32], S[33]); \ - CMIX36(S[27], S[28], S[29], S[31], S[32], S[33], S[ 9], S[10], S[11]); \ - SMIX(shared, S[27], S[28], S[29], S[30]); \ - CMIX36(S[24], S[25], S[26], S[28], S[29], S[30], S[ 6], S[ 7], S[ 8]); \ - SMIX_LDG(shared, S[24], S[25], S[26], S[27]); \ - \ - TIX4(y, S[24], S[25], S[28], S[31], S[32], S[10], S[12], S[15], S[18]); \ - CMIX36(S[21], S[22], S[23], S[25], S[26], S[27], S[ 3], S[ 4], S[ 5]); \ - SMIX(shared, S[21], S[22], S[23], S[24]); \ - CMIX36(S[18], S[19], S[20], S[22], S[23], S[24], S[ 0], S[ 1], S[ 2]); \ - SMIX_LDG(shared, S[18], S[19], S[20], S[21]); \ - CMIX36(S[15], S[16], S[17], S[19], S[20], S[21], S[33], S[34], S[35]); \ - SMIX_LDG(shared, S[15], S[16], S[17], S[18]); \ - CMIX36(S[12], S[13], S[14], S[16], S[17], S[18], S[30], S[31], S[32]); \ - SMIX(shared, S[12], S[13], S[14], S[15]); \ - \ - TIX4(z, S[12], S[13], S[16], S[19], S[20], S[34], S[ 0], S[ 3], S[ 6]); \ - CMIX36(S[ 9], S[10], S[11], S[13], S[14], S[15], S[27], S[28], S[29]); \ - SMIX_LDG(shared, S[ 9], S[10], S[11], S[12]); \ - CMIX36(S[ 6], S[ 7], S[ 8], S[10], S[11], S[12], S[24], S[25], S[26]); \ - SMIX_LDG(shared, S[ 6], S[ 7], S[ 8], S[ 9]); \ - CMIX36(S[ 3], S[ 4], S[ 5], S[ 7], S[ 8], S[ 9], S[21], S[22], S[23]); \ - SMIX_LDG(shared, S[ 3], S[ 4], S[ 5], S[ 6]); \ - CMIX36(S[ 0], S[ 1], S[ 2], S[ 4], S[ 5], S[ 6], S[18], S[19], S[20]); \ - SMIX_LDG(shared, S[ 0], S[ 1], S[ 2], S[ 3]); \ - } +#define FUGUE512_3(x, y, z) { \ + TIX4(x, S00, S01, S04, S07, S08, S22, S24, S27, S30); \ + CMIX36(S33, S34, S35, S01, S02, S03, S15, S16, S17); \ + SMIX(S33, S34, S35, S00); \ + CMIX36(S30, S31, S32, S34, S35, S00, S12, S13, S14); \ + SMIX(S30, S31, S32, S33); \ + CMIX36(S27, S28, S29, S31, S32, S33, S09, S10, S11); \ + SMIX(S27, S28, S29, S30); \ + CMIX36(S24, S25, S26, S28, S29, S30, S06, S07, S08); \ + SMIX(S24, S25, S26, S27); \ + \ + TIX4(y, S24, S25, S28, S31, S32, S10, S12, S15, S18); \ + CMIX36(S21, S22, S23, S25, S26, S27, S03, S04, S05); \ + SMIX(S21, S22, S23, S24); \ + CMIX36(S18, S19, S20, S22, S23, S24, S00, S01, S02); \ + SMIX(S18, S19, S20, S21); \ + CMIX36(S15, S16, S17, S19, S20, S21, S33, S34, S35); \ + SMIX(S15, S16, S17, S18); \ + CMIX36(S12, S13, S14, S16, S17, S18, S30, S31, S32); \ + SMIX(S12, S13, S14, S15); \ + \ + TIX4(z, S12, S13, S16, S19, S20, S34, S00, S03, S06); \ + CMIX36(S09, S10, S11, S13, S14, S15, S27, S28, S29); \ + SMIX(S09, S10, S11, S12); \ + CMIX36(S06, S07, S08, S10, S11, S12, S24, S25, S26); \ + SMIX(S06, S07, S08, S09); \ + CMIX36(S03, S04, S05, S07, S08, S09, S21, S22, S23); \ + SMIX(S03, S04, S05, S06); \ + CMIX36(S00, S01, S02, S04, S05, S06, S18, S19, S20); \ + SMIX(S00, S01, S02, S03); \ +} + + +#define AS_UINT4(addr) *((uint4*)(addr)) /***************************************************/ -// Die Hash-Funktion -__global__ __launch_bounds__(256,3) +__global__ +__launch_bounds__(TPB) void x13_fugue512_gpu_hash_64(uint32_t threads, uint64_t *g_hash) { - __shared__ uint32_t shared[4][256]; - -// if(threadIdx.x<256){ - const uint32_t tmp = mixtab0[threadIdx.x]; - shared[0][threadIdx.x] = tmp; - shared[1][threadIdx.x] = ROR8(tmp); - shared[2][threadIdx.x] = ROL16(tmp); - shared[3][threadIdx.x] = ROL8(tmp); -// } - __syncthreads(); - const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); - if (thread < threads) - { - uint32_t *hash = (uint32_t*)&g_hash[thread<<3]; - - uint32_t S[36]; - uint32_t B[ 9]; - - uint32_t Hash[16]; - - *(uint2x4*)&Hash[0] = __ldg4((uint2x4*)&hash[0]); - *(uint2x4*)&Hash[8] = __ldg4((uint2x4*)&hash[8]); - __syncthreads(); - - S[ 0] = S[ 1] = S[ 2] = S[ 3] = S[ 4] = S[ 5] = S[ 6] = S[ 7] = S[ 8] = S[ 9] = S[10] = S[11] = S[12] = S[13] = S[14] = S[15] = S[16] = S[17] = S[18] = S[19] = 0; - *(uint2x4*)&S[20] = *(uint2x4*)&c_S[ 0]; - *(uint2x4*)&S[28] = *(uint2x4*)&c_S[ 8]; - - FUGUE512_3(Hash[0x0], Hash[0x1], Hash[0x2]); - FUGUE512_3(Hash[0x3], Hash[0x4], Hash[0x5]); - FUGUE512_3(Hash[0x6], Hash[0x7], Hash[0x8]); - FUGUE512_3(Hash[0x9], Hash[0xA], Hash[0xB]); - FUGUE512_3(Hash[0xC], Hash[0xD], Hash[0xE]); - FUGUE512_3(Hash[0xF], 0U, 512U); - - for (uint32_t i = 0; i < 32; i+=2){ - mROR3; - CMIX36(S[ 0], S[ 1], S[ 2], S[ 4], S[ 5], S[ 6], S[18], S[19], S[20]); - SMIX_LDG(shared, S[ 0], S[ 1], S[ 2], S[ 3]); - mROR3; - CMIX36(S[ 0], S[ 1], S[ 2], S[ 4], S[ 5], S[ 6], S[18], S[19], S[20]); - SMIX_LDG(shared, S[ 0], S[ 1], S[ 2], S[ 3]); - } - #pragma unroll - for (uint32_t i = 0; i < 13; i ++) { - S[ 4] ^= S[ 0]; S[ 9] ^= S[ 0]; S[18] ^= S[ 0]; S[27] ^= S[ 0]; - mROR9; - SMIX_LDG(shared, S[ 0], S[ 1], S[ 2], S[ 3]); - S[ 4] ^= S[ 0]; S[10] ^= S[ 0]; S[18] ^= S[ 0]; S[27] ^= S[ 0]; - mROR9; - SMIX_LDG(shared, S[ 0], S[ 1], S[ 2], S[ 3]); - S[ 4] ^= S[ 0]; S[10] ^= S[ 0]; S[19] ^= S[ 0]; S[27] ^= S[ 0]; - mROR9; - SMIX_LDG(shared, S[ 0], S[ 1], S[ 2], S[ 3]); - S[ 4] ^= S[ 0]; S[10] ^= S[ 0]; S[19] ^= S[ 0]; S[28] ^= S[ 0]; - mROR8; - SMIX_LDG(shared, S[ 0], S[ 1], S[ 2], S[ 3]); - } - S[ 4] ^= S[ 0]; S[ 9] ^= S[ 0]; S[18] ^= S[ 0]; S[27] ^= S[ 0]; - - S[ 0] = cuda_swab32(S[ 1]); S[ 1] = cuda_swab32(S[ 2]); S[ 2] = cuda_swab32(S[ 3]); S[ 3] = cuda_swab32(S[ 4]); - S[ 4] = cuda_swab32(S[ 9]); S[ 5] = cuda_swab32(S[10]); S[ 6] = cuda_swab32(S[11]); S[ 7] = cuda_swab32(S[12]); - S[ 8] = cuda_swab32(S[18]); S[ 9] = cuda_swab32(S[19]); S[10] = cuda_swab32(S[20]); S[11] = cuda_swab32(S[21]); - S[12] = cuda_swab32(S[27]); S[13] = cuda_swab32(S[28]); S[14] = cuda_swab32(S[29]); S[15] = cuda_swab32(S[30]); - - *(uint2x4*)&hash[ 0] = *(uint2x4*)&S[ 0]; - *(uint2x4*)&hash[ 8] = *(uint2x4*)&S[ 8]; + __shared__ uint32_t mixtabs[1024]; + + // load shared mem (with 256 threads) + const uint32_t thr = threadIdx.x & 0xFF; + const uint32_t tmp = tex1Dfetch(mixTab0Tex, thr); + mixtabs[thr] = tmp; + mixtabs[thr+256] = ROR8(tmp); + mixtabs[thr+512] = ROL16(tmp); + mixtabs[thr+768] = ROL8(tmp); +#if TPB <= 256 + if (blockDim.x < 256) { + const uint32_t thr = (threadIdx.x + 0x80) & 0xFF; + const uint32_t tmp = tex1Dfetch(mixTab0Tex, thr); + mixtabs[thr] = tmp; + mixtabs[thr + 256] = ROR8(tmp); + mixtabs[thr + 512] = ROL16(tmp); + mixtabs[thr + 768] = ROL8(tmp); } -} +#endif -/***************************************************/ -// The final hash function -__global__ __launch_bounds__(512,2) /* force 56 registers */ -void x13_fugue512_gpu_hash_64_final(uint32_t threads,const uint32_t* __restrict__ g_hash,uint32_t* resNonce, const uint64_t target){ - - __shared__ uint32_t shared[4][256]; - - if(threadIdx.x<256){ - const uint32_t tmp = mixtab0[threadIdx.x]; - shared[0][threadIdx.x] = tmp; - shared[1][threadIdx.x] = ROR8(tmp); - shared[2][threadIdx.x] = ROL16(tmp); - shared[3][threadIdx.x] = ROL8(tmp); - } + __syncthreads(); - const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); if (thread < threads) { - const uint32_t* __restrict__ hash = &g_hash[thread<<4]; - - uint32_t S[36]; - uint32_t B[ 9]; + const size_t hashPosition = thread; + uint64_t*pHash = &g_hash[hashPosition<<3]; uint32_t Hash[16]; - - *(uint2x4*)&Hash[0] = __ldg4((uint2x4*)&hash[0]); - *(uint2x4*)&Hash[8] = __ldg4((uint2x4*)&hash[8]); - __syncthreads(); - S[ 0] = S[ 1] = S[ 2] = S[ 3] = S[ 4] = S[ 5] = S[ 6] = S[ 7] = S[ 8] = S[ 9] = S[10] = S[11] = S[12] = S[13] = S[14] = S[15] = S[16] = S[17] = S[18] = S[19] = 0; - *(uint2x4*)&S[20] = *(uint2x4*)&c_S[ 0]; - *(uint2x4*)&S[28] = *(uint2x4*)&c_S[ 8]; - - FUGUE512_3(Hash[0x0], Hash[0x1], Hash[0x2]); - FUGUE512_3(Hash[0x3], Hash[0x4], Hash[0x5]); - FUGUE512_3(Hash[0x6], Hash[0x7], Hash[0x8]); - FUGUE512_3(Hash[0x9], Hash[0xA], Hash[0xB]); - FUGUE512_3(Hash[0xC], Hash[0xD], Hash[0xE]); - FUGUE512_3(Hash[0xF], 0, 512); - - for (int i = 0; i < 32; i++){ - mROR3; - CMIX36(S[ 0], S[ 1], S[ 2], S[ 4], S[ 5], S[ 6], S[18], S[19], S[20]); - SMIX_LDG(shared, S[ 0], S[ 1], S[ 2], S[ 3]); - } - #pragma unroll - for (int i = 0; i < 12; i++) { - S[ 4] ^= S[ 0]; S[ 9] ^= S[ 0]; S[18] ^= S[ 0]; S[27] ^= S[ 0]; - mROR9; - SMIX_LDG(shared, S[ 0], S[ 1], S[ 2], S[ 3]); - S[ 4] ^= S[ 0]; S[10] ^= S[ 0]; S[18] ^= S[ 0]; S[27] ^= S[ 0]; - mROR9; - SMIX_LDG(shared, S[ 0], S[ 1], S[ 2], S[ 3]); - S[ 4] ^= S[ 0]; S[10] ^= S[ 0]; S[19] ^= S[ 0]; S[27] ^= S[ 0]; - mROR9; - SMIX_LDG(shared, S[ 0], S[ 1], S[ 2], S[ 3]); - S[ 4] ^= S[ 0]; S[10] ^= S[ 0]; S[19] ^= S[ 0]; S[28] ^= S[ 0]; - mROR8; - SMIX_LDG(shared, S[ 0], S[ 1], S[ 2], S[ 3]); + + #pragma unroll 4 + for(int i = 0; i < 4; i++) + AS_UINT4(&Hash[i*4]) = AS_UINT4(&pHash[i*2]); + + #pragma unroll 16 + for(int i = 0; i < 16; i++) + Hash[i] = cuda_swab32(Hash[i]); + + uint32_t S00, S01, S02, S03, S04, S05, S06, S07, S08, S09; + uint32_t S10, S11, S12, S13, S14, S15, S16, S17, S18, S19; + uint32_t S20, S21, S22, S23, S24, S25, S26, S27, S28, S29; + uint32_t S30, S31, S32, S33, S34, S35; + + uint32_t B27, B28, B29, B30, B31, B32, B33, B34, B35; + //const uint64_t bc = (64ULL << 3); // 512 + //const uint32_t bclo = (uint32_t)(bc); + //const uint32_t bchi = (uint32_t)(bc >> 32); + + S00 = S01 = S02 = S03 = S04 = S05 = S06 = S07 = S08 = S09 = 0; + S10 = S11 = S12 = S13 = S14 = S15 = S16 = S17 = S18 = S19 = 0; + S20 = 0x8807a57e; S21 = 0xe616af75; S22 = 0xc5d3e4db; S23 = 0xac9ab027; + S24 = 0xd915f117; S25 = 0xb6eecc54; S26 = 0x06e8020b; S27 = 0x4a92efd1; + S28 = 0xaac6e2c9; S29 = 0xddb21398; S30 = 0xcae65838; S31 = 0x437f203f; + S32 = 0x25ea78e7; S33 = 0x951fddd6; S34 = 0xda6ed11d; S35 = 0xe13e3567; + + FUGUE512_3((Hash[0x0]), (Hash[0x1]), (Hash[0x2])); + FUGUE512_3((Hash[0x3]), (Hash[0x4]), (Hash[0x5])); + FUGUE512_3((Hash[0x6]), (Hash[0x7]), (Hash[0x8])); + FUGUE512_3((Hash[0x9]), (Hash[0xA]), (Hash[0xB])); + FUGUE512_3((Hash[0xC]), (Hash[0xD]), (Hash[0xE])); + FUGUE512_3((Hash[0xF]), 0u /*bchi*/, 512u /*bclo*/); + + #pragma unroll 32 + for (int i = 0; i < 32; i ++) { + SUB_ROR3; + CMIX36(S00, S01, S02, S04, S05, S06, S18, S19, S20); + SMIX(S00, S01, S02, S03); } - S[ 4] ^= S[ 0]; S[ 9] ^= S[ 0]; S[18] ^= S[ 0]; S[27] ^= S[ 0]; - mROR9; - SMIX_LDG(shared, S[ 0], S[ 1], S[ 2], S[ 3]); - S[ 4] ^= S[ 0]; S[10] ^= S[ 0]; S[18] ^= S[ 0]; S[27] ^= S[ 0]; - mROR9; - SMIX_LDG(shared, S[ 0], S[ 1], S[ 2], S[ 3]); - S[ 4] ^= S[ 0]; S[10] ^= S[ 0]; S[19] ^= S[ 0]; S[27] ^= S[ 0]; - mROR9; - SMIX_LDG(shared, S[ 0], S[ 1], S[ 2], S[ 3]); - - S[ 3] = cuda_swab32(S[3]); S[ 4] = cuda_swab32(S[4]^S[ 0]); - - const uint64_t check = *(uint64_t*)&S[ 3]; - if(check <= target){ - uint32_t tmp = atomicExch(&resNonce[0], thread); - if (tmp != UINT32_MAX) - resNonce[1] = tmp; + #pragma unroll 13 + for (int i = 0; i < 13; i++) { + S04 ^= S00; + S09 ^= S00; + S18 ^= S00; + S27 ^= S00; + SUB_ROR9; + SMIX(S00, S01, S02, S03); + S04 ^= S00; + S10 ^= S00; + S18 ^= S00; + S27 ^= S00; + SUB_ROR9; + SMIX(S00, S01, S02, S03); + S04 ^= S00; + S10 ^= S00; + S19 ^= S00; + S27 ^= S00; + SUB_ROR9; + SMIX(S00, S01, S02, S03); + S04 ^= S00; + S10 ^= S00; + S19 ^= S00; + S28 ^= S00; + SUB_ROR8; + SMIX(S00, S01, S02, S03); } + S04 ^= S00; + S09 ^= S00; + S18 ^= S00; + S27 ^= S00; + + Hash[0] = cuda_swab32(S01); + Hash[1] = cuda_swab32(S02); + Hash[2] = cuda_swab32(S03); + Hash[3] = cuda_swab32(S04); + Hash[4] = cuda_swab32(S09); + Hash[5] = cuda_swab32(S10); + Hash[6] = cuda_swab32(S11); + Hash[7] = cuda_swab32(S12); + Hash[8] = cuda_swab32(S18); + Hash[9] = cuda_swab32(S19); + Hash[10] = cuda_swab32(S20); + Hash[11] = cuda_swab32(S21); + Hash[12] = cuda_swab32(S27); + Hash[13] = cuda_swab32(S28); + Hash[14] = cuda_swab32(S29); + Hash[15] = cuda_swab32(S30); + + #pragma unroll 4 + for(int i = 0; i < 4; i++) + AS_UINT4(&pHash[i*2]) = AS_UINT4(&Hash[i*4]); } } -__host__ -void x13_fugue512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash){ - - const uint32_t threadsperblock = 256; - - // berechne wie viele Thread Blocks wir brauchen - dim3 grid((threads + threadsperblock-1)/threadsperblock); - dim3 block(threadsperblock); +#define texDef(id, texname, texmem, texsource, texsize) { \ + unsigned int *texmem; \ + cudaMalloc(&texmem, texsize); \ + d_textures[thr_id][id] = texmem; \ + cudaMemcpy(texmem, texsource, texsize, cudaMemcpyHostToDevice); \ + texname.normalized = 0; \ + texname.filterMode = cudaFilterModePoint; \ + texname.addressMode[0] = cudaAddressModeClamp; \ + { cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(); \ + cudaBindTexture(NULL, &texname, texmem, &channelDesc, texsize ); \ + } \ +} - x13_fugue512_gpu_hash_64<<>>(threads, (uint64_t*)d_hash); +__host__ +void x13_fugue512_cpu_init(int thr_id, uint32_t threads) +{ + texDef(0, mixTab0Tex, mixTab0m, mixtab0, sizeof(uint32_t)*256); } __host__ -void x13_fugue512_cpu_hash_64_final(int thr_id, uint32_t threads, uint32_t *d_hash, uint32_t *d_resNonce, const uint64_t target){ +void x13_fugue512_cpu_free(int thr_id) +{ + cudaFree(d_textures[thr_id][0]); +} - const uint32_t threadsperblock = 512; +__host__ +//void fugue512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash) +void x13_fugue512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash) +{ + const uint32_t threadsperblock = TPB; - // berechne wie viele Thread Blocks wir brauchen dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 block(threadsperblock); - x13_fugue512_gpu_hash_64_final<<>>(threads, d_hash,d_resNonce,target); + x13_fugue512_gpu_hash_64 <<>> (threads, (uint64_t*)d_hash); }