diff --git a/algos.h b/algos.h
index 1a16fb193c..29c6ef09a9 100644
--- a/algos.h
+++ b/algos.h
@@ -24,7 +24,7 @@ enum sha_algos {
ALGO_X11,
ALGO_X11EVO,
ALGO_C11,
- ALGO_SIB,
+ ALGO_POLY,
ALGO_X13,
ALGO_X14,
ALGO_X15,
@@ -61,7 +61,7 @@ static const char *algo_names[] = {
"x11",
"x11evo",
"c11",
- "sib",
+ "poly",
"x13",
"x14",
"x15",
diff --git a/bench.cpp b/bench.cpp
index e3926b0a6a..e6696bce3a 100644
--- a/bench.cpp
+++ b/bench.cpp
@@ -60,7 +60,7 @@ void algo_free_all(int thr_id){
free_x11(thr_id);
free_x11evo(thr_id);
free_c11(thr_id);
- free_sib(thr_id);
+ free_poly(thr_id);
free_x13(thr_id);
free_x14(thr_id);
free_x15(thr_id);
diff --git a/ccminer.cpp b/ccminer.cpp
index cd9e7964b5..ac16cbb299 100644
--- a/ccminer.cpp
+++ b/ccminer.cpp
@@ -229,7 +229,7 @@ Options:\n\
qubit Qubit\n\
x11 X11 (DarkCoin)\n\
c11 C11 (Chaincoin)\n\
- sib X11+gost (Sibcoin)\n\
+ poly Veltor+stuff (Polytimos)\n\
x11evo Permuted x11 (Revolver)\n\
x13 X13 (MaruCoin)\n\
x14 X14 (BernCoin)\n\
@@ -1928,7 +1928,7 @@ static void *miner_thread(void *userdata)
minmax = 0x8000000;
break;
case ALGO_NEOSCRYPT:
- case ALGO_SIB:
+ case ALGO_POLY:
case ALGO_VELTOR:
case ALGO_LYRA2:
minmax = 0x80000;
@@ -2063,8 +2063,8 @@ static void *miner_thread(void *userdata)
case ALGO_HSR:
rc = scanhash_hsr(thr_id, &work, max_nonce, &hashes_done);
break;
- case ALGO_SIB:
- rc = scanhash_sib(thr_id, &work, max_nonce, &hashes_done);
+ case ALGO_POLY:
+ rc = scanhash_poly(thr_id, &work, max_nonce, &hashes_done);
break;
case ALGO_VELTOR:
rc = scanhash_veltor(thr_id, &work, max_nonce, &hashes_done);
diff --git a/ccminer.vcxproj b/ccminer.vcxproj
index 10a006d03d..ceb7cf8f24 100644
--- a/ccminer.vcxproj
+++ b/ccminer.vcxproj
@@ -414,6 +414,7 @@
72
+
diff --git a/ccminer.vcxproj.filters b/ccminer.vcxproj.filters
index a3d68641da..15f4f287e6 100644
--- a/ccminer.vcxproj.filters
+++ b/ccminer.vcxproj.filters
@@ -628,6 +628,9 @@
Source Files\CUDA\x13
+
+ Source Files\CUDA\x13
+
Source Files\CUDA\quark
diff --git a/miner.h b/miner.h
index 7624acb2e1..34ce49cba2 100644
--- a/miner.h
+++ b/miner.h
@@ -282,7 +282,7 @@ extern int scanhash_whirl(int thr_id, struct work* work, uint32_t max_nonce, uns
extern int scanhash_x11(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_x11evo(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_c11(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
-extern int scanhash_sib(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
+extern int scanhash_poly(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_x13(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_x14(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_x15(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
@@ -322,7 +322,7 @@ extern void free_whirl(int thr_id);
extern void free_x11(int thr_id);
extern void free_x11evo(int thr_id);
extern void free_c11(int thr_id);
-extern void free_sib(int thr_id);
+extern void free_poly(int thr_id);
extern void free_x13(int thr_id);
extern void free_x14(int thr_id);
extern void free_x15(int thr_id);
@@ -805,7 +805,7 @@ void wcoinhash(void *state, const void *input);
void x11hash(void *output, const void *input);
void x11evo_hash(void *output, const void *input);
void c11hash(void *output, const void *input);
-void sibhash(void *output, const void *input);
+void polyhash(void *output, const void *input);
void x13hash(void *output, const void *input);
void x14hash(void *output, const void *input);
void x15hash(void *output, const void *input);
diff --git a/quark/cuda_quark_skein512.cu b/quark/cuda_quark_skein512.cu
index 5db1873e61..b21ce5a8c7 100644
--- a/quark/cuda_quark_skein512.cu
+++ b/quark/cuda_quark_skein512.cu
@@ -420,8 +420,150 @@ void skein512_gpu_hash_80(uint32_t threads, uint32_t startNounce, uint64_t *outp
}
}
-
__host__
+void skein512_cpu_setBlock_80( void *pdata)
+{
+ uint64_t message[20];
+ memcpy(&message[0], pdata, 80);
+
+ uint64_t p[8];
+ uint64_t h[9];
+ uint64_t t0, t1, t2;
+
+ h[0] = 0x4903ADFF749C51CEull;
+ h[1] = 0x0D95DE399746DF03ull;
+ h[2] = 0x8FD1934127C79BCEull;
+ h[3] = 0x9A255629FF352CB1ull;
+ h[4] = 0x5DB62599DF6CA7B0ull;
+ h[5] = 0xEABE394CA9D5C3F4ull;
+ h[6] = 0x991112C71A75B523ull;
+ h[7] = 0xAE18A40B660FCC33ull;
+ // h[8] = h[0] ^ h[1] ^ h[2] ^ h[3] ^ h[4] ^ h[5] ^ h[6] ^ h[7] ^ SPH_C64(0x1BD11BDAA9FC1A22);
+ h[8] = 0xcab2076d98173ec4ULL;
+
+ t0 = 64; // ptr
+ t1 = 0x7000000000000000ull;
+ t2 = 0x7000000000000040ull;
+
+ memcpy(&p[0], &message[0], 64);
+
+ TFBIG_4e_PRE(0);
+ TFBIG_4o_PRE(1);
+ TFBIG_4e_PRE(2);
+ TFBIG_4o_PRE(3);
+ TFBIG_4e_PRE(4);
+ TFBIG_4o_PRE(5);
+ TFBIG_4e_PRE(6);
+ TFBIG_4o_PRE(7);
+ TFBIG_4e_PRE(8);
+ TFBIG_4o_PRE(9);
+ TFBIG_4e_PRE(10);
+ TFBIG_4o_PRE(11);
+ TFBIG_4e_PRE(12);
+ TFBIG_4o_PRE(13);
+ TFBIG_4e_PRE(14);
+ TFBIG_4o_PRE(15);
+ TFBIG_4e_PRE(16);
+ TFBIG_4o_PRE(17);
+ TFBIG_ADDKEY_PRE(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], h, t, 18);
+ message[10] = message[0] ^ p[0];
+ message[11] = message[1] ^ p[1];
+ message[12] = message[2] ^ p[2];
+ message[13] = message[3] ^ p[3];
+ message[14] = message[4] ^ p[4];
+ message[15] = message[5] ^ p[5];
+ message[16] = message[6] ^ p[6];
+ message[17] = message[7] ^ p[7];
+ message[18] = t2;
+
+ uint64_t buffer[128];
+
+// buffer[ 0] = message[ 8];
+ buffer[ 0] = message[ 9];
+ h[0] = buffer[ 1] = message[10];
+ h[1] = buffer[ 2] = message[11];
+ h[2] = buffer[ 3] = message[12];
+ h[3] = buffer[ 4] = message[13];
+ h[4] = buffer[ 5] = message[14];
+ h[5] = buffer[ 6] = message[15];
+ h[6] = buffer[ 7] = message[16];
+ h[7] = buffer[ 8] = message[17];
+ h[8] = buffer[ 9] = h[0]^h[1]^h[2]^h[3]^h[4]^h[5]^h[6]^h[7]^0x1BD11BDAA9FC1A22ULL;
+
+ t0 = 0x50ull;
+ t1 = 0xB000000000000000ull;
+ t2 = t0^t1;
+
+ p[0] = message[ 8] + h[0];
+ p[2] = h[2]; p[3] = h[3]; p[4] = h[4];
+ p[5] = h[5] + t0;
+ p[6] = h[6] + t1;
+ p[7] = h[7];
+
+ p[2] += p[3];
+ p[4] += p[5]; p[6] += p[7];
+
+ p[3] = ROTL64(p[3], 36) ^ p[2];
+ p[5] = ROTL64(p[5], 19) ^ p[4];
+ p[7] = ROTL64(p[7], 37) ^ p[6];
+ p[4] += p[7]; p[6] += p[5];
+
+ p[7] = ROTL64(p[7], 27) ^ p[4];
+ p[5] = ROTL64(p[5], 14) ^ p[6];
+ buffer[10] = p[ 0];
+ buffer[11] = p[ 2];
+ buffer[12] = p[ 3];
+ buffer[13] = p[ 4];
+ buffer[14] = p[ 5];
+ buffer[15] = p[ 6];
+ buffer[16] = p[ 7];
+ buffer[17] = ROTL64(p[3], 42);
+ buffer[18] = ROTL64(p[5], 36);
+ buffer[19] = ROTL64(p[7], 39);
+
+ buffer[20] = h[6]+t1;
+ buffer[21] = h[8]+1;
+ buffer[22] = h[7]+t2;
+ buffer[23] = h[0]+2;
+ buffer[24] = h[8]+t0;
+ buffer[25] = h[1]+3;
+ buffer[26] = h[0]+t1;
+ buffer[27] = h[2]+4;
+ buffer[28] = h[1]+t2;
+ buffer[29] = h[3]+5;
+ buffer[30] = h[2]+t0;
+ buffer[31] = h[4]+6;
+ buffer[32] = h[3]+t1;
+ buffer[33] = h[5]+7;
+ buffer[34] = h[4]+t2;
+ buffer[35] = h[6]+8;
+ buffer[36] = h[5]+t0;
+ buffer[37] = h[7]+9;
+ buffer[38] = h[6]+t1;
+ buffer[39] = h[8]+10;
+ buffer[40] = h[7]+t2;
+ buffer[41] = h[0]+11;
+ buffer[42] = h[8]+t0;
+ buffer[43] = h[1]+12;
+ buffer[44] = h[0]+t1;
+ buffer[45] = h[2]+13;
+ buffer[46] = h[1]+t2;
+ buffer[47] = h[3]+14;
+ buffer[48] = h[2]+t0;
+ buffer[49] = h[4]+15;
+ buffer[50] = h[3]+t1;
+ buffer[51] = h[5]+16;
+ buffer[52] = h[4]+t2;
+ buffer[53] = h[6]+17;
+ buffer[54] = h[5]+t0;
+ buffer[55] = h[7]+18;
+ buffer[56] = h[6]+t1;
+
+ buffer[57] = message[ 8];
+
+ cudaMemcpyToSymbol(c_buffer, buffer, sizeof(c_buffer), 0, cudaMemcpyHostToDevice);
+}
+/*
void skein512_cpu_setBlock_80(void *pdata)
{
uint64_t message[20];
@@ -564,7 +706,7 @@ void skein512_cpu_setBlock_80(void *pdata)
CUDA_SAFE_CALL(cudaGetLastError());
}
-
+*/
__host__
void skein512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *g_hash)
{
diff --git a/quark/skein_header.h b/quark/skein_header.h
new file mode 100644
index 0000000000..460b311f57
--- /dev/null
+++ b/quark/skein_header.h
@@ -0,0 +1,385 @@
+/* Elementary defines for SKEIN */
+
+/*
+ * M9_ ## s ## _ ## i evaluates to s+i mod 9 (0 <= s <= 18, 0 <= i <= 7).
+ */
+
+#define M9_0_0 0
+#define M9_0_1 1
+#define M9_0_2 2
+#define M9_0_3 3
+#define M9_0_4 4
+#define M9_0_5 5
+#define M9_0_6 6
+#define M9_0_7 7
+
+#define M9_1_0 1
+#define M9_1_1 2
+#define M9_1_2 3
+#define M9_1_3 4
+#define M9_1_4 5
+#define M9_1_5 6
+#define M9_1_6 7
+#define M9_1_7 8
+
+#define M9_2_0 2
+#define M9_2_1 3
+#define M9_2_2 4
+#define M9_2_3 5
+#define M9_2_4 6
+#define M9_2_5 7
+#define M9_2_6 8
+#define M9_2_7 0
+
+#define M9_3_0 3
+#define M9_3_1 4
+#define M9_3_2 5
+#define M9_3_3 6
+#define M9_3_4 7
+#define M9_3_5 8
+#define M9_3_6 0
+#define M9_3_7 1
+
+#define M9_4_0 4
+#define M9_4_1 5
+#define M9_4_2 6
+#define M9_4_3 7
+#define M9_4_4 8
+#define M9_4_5 0
+#define M9_4_6 1
+#define M9_4_7 2
+
+#define M9_5_0 5
+#define M9_5_1 6
+#define M9_5_2 7
+#define M9_5_3 8
+#define M9_5_4 0
+#define M9_5_5 1
+#define M9_5_6 2
+#define M9_5_7 3
+
+#define M9_6_0 6
+#define M9_6_1 7
+#define M9_6_2 8
+#define M9_6_3 0
+#define M9_6_4 1
+#define M9_6_5 2
+#define M9_6_6 3
+#define M9_6_7 4
+
+#define M9_7_0 7
+#define M9_7_1 8
+#define M9_7_2 0
+#define M9_7_3 1
+#define M9_7_4 2
+#define M9_7_5 3
+#define M9_7_6 4
+#define M9_7_7 5
+
+#define M9_8_0 8
+#define M9_8_1 0
+#define M9_8_2 1
+#define M9_8_3 2
+#define M9_8_4 3
+#define M9_8_5 4
+#define M9_8_6 5
+#define M9_8_7 6
+
+#define M9_9_0 0
+#define M9_9_1 1
+#define M9_9_2 2
+#define M9_9_3 3
+#define M9_9_4 4
+#define M9_9_5 5
+#define M9_9_6 6
+#define M9_9_7 7
+
+#define M9_10_0 1
+#define M9_10_1 2
+#define M9_10_2 3
+#define M9_10_3 4
+#define M9_10_4 5
+#define M9_10_5 6
+#define M9_10_6 7
+#define M9_10_7 8
+
+#define M9_11_0 2
+#define M9_11_1 3
+#define M9_11_2 4
+#define M9_11_3 5
+#define M9_11_4 6
+#define M9_11_5 7
+#define M9_11_6 8
+#define M9_11_7 0
+
+#define M9_12_0 3
+#define M9_12_1 4
+#define M9_12_2 5
+#define M9_12_3 6
+#define M9_12_4 7
+#define M9_12_5 8
+#define M9_12_6 0
+#define M9_12_7 1
+
+#define M9_13_0 4
+#define M9_13_1 5
+#define M9_13_2 6
+#define M9_13_3 7
+#define M9_13_4 8
+#define M9_13_5 0
+#define M9_13_6 1
+#define M9_13_7 2
+
+#define M9_14_0 5
+#define M9_14_1 6
+#define M9_14_2 7
+#define M9_14_3 8
+#define M9_14_4 0
+#define M9_14_5 1
+#define M9_14_6 2
+#define M9_14_7 3
+
+#define M9_15_0 6
+#define M9_15_1 7
+#define M9_15_2 8
+#define M9_15_3 0
+#define M9_15_4 1
+#define M9_15_5 2
+#define M9_15_6 3
+#define M9_15_7 4
+
+#define M9_16_0 7
+#define M9_16_1 8
+#define M9_16_2 0
+#define M9_16_3 1
+#define M9_16_4 2
+#define M9_16_5 3
+#define M9_16_6 4
+#define M9_16_7 5
+
+#define M9_17_0 8
+#define M9_17_1 0
+#define M9_17_2 1
+#define M9_17_3 2
+#define M9_17_4 3
+#define M9_17_5 4
+#define M9_17_6 5
+#define M9_17_7 6
+
+#define M9_18_0 0
+#define M9_18_1 1
+#define M9_18_2 2
+#define M9_18_3 3
+#define M9_18_4 4
+#define M9_18_5 5
+#define M9_18_6 6
+#define M9_18_7 7
+
+/*
+ * M3_ ## s ## _ ## i evaluates to s+i mod 3 (0 <= s <= 18, 0 <= i <= 1).
+ */
+
+#define M3_0_0 0
+#define M3_0_1 1
+#define M3_1_0 1
+#define M3_1_1 2
+#define M3_2_0 2
+#define M3_2_1 0
+#define M3_3_0 0
+#define M3_3_1 1
+#define M3_4_0 1
+#define M3_4_1 2
+#define M3_5_0 2
+#define M3_5_1 0
+#define M3_6_0 0
+#define M3_6_1 1
+#define M3_7_0 1
+#define M3_7_1 2
+#define M3_8_0 2
+#define M3_8_1 0
+#define M3_9_0 0
+#define M3_9_1 1
+#define M3_10_0 1
+#define M3_10_1 2
+#define M3_11_0 2
+#define M3_11_1 0
+#define M3_12_0 0
+#define M3_12_1 1
+#define M3_13_0 1
+#define M3_13_1 2
+#define M3_14_0 2
+#define M3_14_1 0
+#define M3_15_0 0
+#define M3_15_1 1
+#define M3_16_0 1
+#define M3_16_1 2
+#define M3_17_0 2
+#define M3_17_1 0
+#define M3_18_0 0
+#define M3_18_1 1
+
+#define XCAT(x, y) XCAT_(x, y)
+#define XCAT_(x, y) x ## y
+
+#define SKBI(k, s, i) XCAT(k, XCAT(XCAT(XCAT(M9_, s), _), i))
+#define SKBT(t, s, v) XCAT(t, XCAT(XCAT(XCAT(M3_, s), _), v))
+
+#define TFBIG_ADDKEY(w0, w1, w2, w3, w4, w5, w6, w7, k, t, s) { \
+ w0 = (w0 + SKBI(k, s, 0)); \
+ w1 = (w1 + SKBI(k, s, 1)); \
+ w2 = (w2 + SKBI(k, s, 2)); \
+ w3 = (w3 + SKBI(k, s, 3)); \
+ w4 = (w4 + SKBI(k, s, 4)); \
+ w5 = (w5 + SKBI(k, s, 5) + SKBT(t, s, 0)); \
+ w6 = (w6 + SKBI(k, s, 6) + SKBT(t, s, 1)); \
+ w7 = (w7 + SKBI(k, s, 7) + make_uint2(s,0); \
+ }
+
+#define TFBIG_MIX(x0, x1, rc) { \
+ x0 = x0 + x1; \
+ x1 = ROL2(x1, rc) ^ x0; \
+ }
+
+#define TFBIG_MIX8(w0, w1, w2, w3, w4, w5, w6, w7, rc0, rc1, rc2, rc3) { \
+ TFBIG_MIX(w0, w1, rc0); \
+ TFBIG_MIX(w2, w3, rc1); \
+ TFBIG_MIX(w4, w5, rc2); \
+ TFBIG_MIX(w6, w7, rc3); \
+ }
+
+#define TFBIG_4e(s) { \
+ TFBIG_ADDKEY(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], h, t, s); \
+ TFBIG_MIX8(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], 46, 36, 19, 37); \
+ TFBIG_MIX8(p[2], p[1], p[4], p[7], p[6], p[5], p[0], p[3], 33, 27, 14, 42); \
+ TFBIG_MIX8(p[4], p[1], p[6], p[3], p[0], p[5], p[2], p[7], 17, 49, 36, 39); \
+ TFBIG_MIX8(p[6], p[1], p[0], p[7], p[2], p[5], p[4], p[3], 44, 9, 54, 56); \
+ }
+
+#define TFBIG_4o(s) { \
+ TFBIG_ADDKEY(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], h, t, s); \
+ TFBIG_MIX8(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], 39, 30, 34, 24); \
+ TFBIG_MIX8(p[2], p[1], p[4], p[7], p[6], p[5], p[0], p[3], 13, 50, 10, 17); \
+ TFBIG_MIX8(p[4], p[1], p[6], p[3], p[0], p[5], p[2], p[7], 25, 29, 39, 43); \
+ TFBIG_MIX8(p[6], p[1], p[0], p[7], p[2], p[5], p[4], p[3], 8, 35, 56, 22); \
+ }
+
+#define TFBIG_KINIT_UI2(k0, k1, k2, k3, k4, k5, k6, k7, k8, t0, t1, t2) { \
+ k8 = ((k0 ^ k1) ^ (k2 ^ k3)) ^ ((k4 ^ k5) ^ (k6 ^ k7)) \
+ ^ vectorize(0x1BD11BDAA9FC1A22); \
+ t2 = t0 ^ t1; \
+ }
+
+#define TFBIG_ADDKEY_UI2(w0, w1, w2, w3, w4, w5, w6, w7, k, t, s) { \
+ w0 = (w0 + SKBI(k, s, 0)); \
+ w1 = (w1 + SKBI(k, s, 1)); \
+ w2 = (w2 + SKBI(k, s, 2)); \
+ w3 = (w3 + SKBI(k, s, 3)); \
+ w4 = (w4 + SKBI(k, s, 4)); \
+ w5 = (w5 + SKBI(k, s, 5) + SKBT(t, s, 0)); \
+ w6 = (w6 + SKBI(k, s, 6) + SKBT(t, s, 1)); \
+ w7 = (w7 + SKBI(k, s, 7) + vectorize(s)); \
+ }
+
+#define TFBIG_ADDKEY_PRE(w0, w1, w2, w3, w4, w5, w6, w7, k, t, s) { \
+ w0 = (w0 + SKBI(k, s, 0)); \
+ w1 = (w1 + SKBI(k, s, 1)); \
+ w2 = (w2 + SKBI(k, s, 2)); \
+ w3 = (w3 + SKBI(k, s, 3)); \
+ w4 = (w4 + SKBI(k, s, 4)); \
+ w5 = (w5 + SKBI(k, s, 5) + SKBT(t, s, 0)); \
+ w6 = (w6 + SKBI(k, s, 6) + SKBT(t, s, 1)); \
+ w7 = (w7 + SKBI(k, s, 7) + (s)); \
+ }
+
+#define TFBIG_MIX_UI2(x0, x1, rc) { \
+ x0 = x0 + x1; \
+ x1 = ROL2(x1, rc) ^ x0; \
+ }
+
+#define TFBIG_MIX_PRE(x0, x1, rc) { \
+ x0 = x0 + x1; \
+ x1 = ROTL64(x1, rc) ^ x0; \
+ }
+
+#define TFBIG_MIX8_UI2(w0, w1, w2, w3, w4, w5, w6, w7, rc0, rc1, rc2, rc3) { \
+ TFBIG_MIX_UI2(w0, w1, rc0); \
+ TFBIG_MIX_UI2(w2, w3, rc1); \
+ TFBIG_MIX_UI2(w4, w5, rc2); \
+ TFBIG_MIX_UI2(w6, w7, rc3); \
+ }
+
+#define TFBIG_MIX8_PRE(w0, w1, w2, w3, w4, w5, w6, w7, rc0, rc1, rc2, rc3) { \
+ TFBIG_MIX_PRE(w0, w1, rc0); \
+ TFBIG_MIX_PRE(w2, w3, rc1); \
+ TFBIG_MIX_PRE(w4, w5, rc2); \
+ TFBIG_MIX_PRE(w6, w7, rc3); \
+ }
+
+#define TFBIG_4e_UI2(s) { \
+ TFBIG_ADDKEY_UI2(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], h, t, s); \
+ TFBIG_MIX8_UI2(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], 46, 36, 19, 37); \
+ TFBIG_MIX8_UI2(p[2], p[1], p[4], p[7], p[6], p[5], p[0], p[3], 33, 27, 14, 42); \
+ TFBIG_MIX8_UI2(p[4], p[1], p[6], p[3], p[0], p[5], p[2], p[7], 17, 49, 36, 39); \
+ TFBIG_MIX8_UI2(p[6], p[1], p[0], p[7], p[2], p[5], p[4], p[3], 44, 9, 54, 56); \
+ }
+
+#define TFBIG_4e_PRE(s) { \
+ TFBIG_ADDKEY_PRE(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], h, t, s); \
+ TFBIG_MIX8_PRE(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], 46, 36, 19, 37); \
+ TFBIG_MIX8_PRE(p[2], p[1], p[4], p[7], p[6], p[5], p[0], p[3], 33, 27, 14, 42); \
+ TFBIG_MIX8_PRE(p[4], p[1], p[6], p[3], p[0], p[5], p[2], p[7], 17, 49, 36, 39); \
+ TFBIG_MIX8_PRE(p[6], p[1], p[0], p[7], p[2], p[5], p[4], p[3], 44, 9, 54, 56); \
+ }
+
+#define TFBIG_4o_UI2(s) { \
+ TFBIG_ADDKEY_UI2(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], h, t, s); \
+ TFBIG_MIX8_UI2(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], 39, 30, 34, 24); \
+ TFBIG_MIX8_UI2(p[2], p[1], p[4], p[7], p[6], p[5], p[0], p[3], 13, 50, 10, 17); \
+ TFBIG_MIX8_UI2(p[4], p[1], p[6], p[3], p[0], p[5], p[2], p[7], 25, 29, 39, 43); \
+ TFBIG_MIX8_UI2(p[6], p[1], p[0], p[7], p[2], p[5], p[4], p[3], 8, 35, 56, 22); \
+ }
+
+#define TFBIG_4o_PRE(s) { \
+ TFBIG_ADDKEY_PRE(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], h, t, s); \
+ TFBIG_MIX8_PRE(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], 39, 30, 34, 24); \
+ TFBIG_MIX8_PRE(p[2], p[1], p[4], p[7], p[6], p[5], p[0], p[3], 13, 50, 10, 17); \
+ TFBIG_MIX8_PRE(p[4], p[1], p[6], p[3], p[0], p[5], p[2], p[7], 25, 29, 39, 43); \
+ TFBIG_MIX8_PRE(p[6], p[1], p[0], p[7], p[2], p[5], p[4], p[3], 8, 35, 56, 22); \
+ }
+
+#define TFBIGMIX8e(){\
+ p[ 0]+=p[ 1];p[ 2]+=p[ 3];p[ 4]+=p[ 5];p[ 6]+=p[ 7];p[ 1]=ROL2(p[ 1],46) ^ p[ 0];p[ 3]=ROL2(p[ 3],36) ^ p[ 2];p[ 5]=ROL2(p[ 5],19) ^ p[ 4];p[ 7]=ROL2(p[ 7],37) ^ p[ 6];\
+ p[ 2]+=p[ 1];p[ 4]+=p[ 7];p[ 6]+=p[ 5];p[ 0]+=p[ 3];p[ 1]=ROL2(p[ 1],33) ^ p[ 2];p[ 7]=ROL2(p[ 7],27) ^ p[ 4];p[ 5]=ROL2(p[ 5],14) ^ p[ 6];p[ 3]=ROL2(p[ 3],42) ^ p[ 0];\
+ p[ 4]+=p[ 1];p[ 6]+=p[ 3];p[ 0]+=p[ 5];p[ 2]+=p[ 7];p[ 1]=ROL2(p[ 1],17) ^ p[ 4];p[ 3]=ROL2(p[ 3],49) ^ p[ 6];p[ 5]=ROL2(p[ 5],36) ^ p[ 0];p[ 7]=ROL2(p[ 7],39) ^ p[ 2];\
+ p[ 6]+=p[ 1];p[ 0]+=p[ 7];p[ 2]+=p[ 5];p[ 4]+=p[ 3];p[ 1]=ROL2(p[ 1],44) ^ p[ 6];p[ 7]=ROL2(p[ 7], 9) ^ p[ 0];p[ 5]=ROL2(p[ 5],54) ^ p[ 2];p[ 3]=ROR8(p[ 3]) ^ p[ 4];\
+}
+#define TFBIGMIX8o(){\
+ p[ 0]+=p[ 1];p[ 2]+=p[ 3];p[ 4]+=p[ 5];p[ 6]+=p[ 7];p[ 1]=ROL2(p[ 1],39) ^ p[ 0];p[ 3]=ROL2(p[ 3],30) ^ p[ 2];p[ 5]=ROL2(p[ 5],34) ^ p[ 4];p[ 7]=ROL24(p[ 7]) ^ p[ 6];\
+ p[ 2]+=p[ 1];p[ 4]+=p[ 7];p[ 6]+=p[ 5];p[ 0]+=p[ 3];p[ 1]=ROL2(p[ 1],13) ^ p[ 2];p[ 7]=ROL2(p[ 7],50) ^ p[ 4];p[ 5]=ROL2(p[ 5],10) ^ p[ 6];p[ 3]=ROL2(p[ 3],17) ^ p[ 0];\
+ p[ 4]+=p[ 1];p[ 6]+=p[ 3];p[ 0]+=p[ 5];p[ 2]+=p[ 7];p[ 1]=ROL2(p[ 1],25) ^ p[ 4];p[ 3]=ROL2(p[ 3],29) ^ p[ 6];p[ 5]=ROL2(p[ 5],39) ^ p[ 0];p[ 7]=ROL2(p[ 7],43) ^ p[ 2];\
+ p[ 6]+=p[ 1];p[ 0]+=p[ 7];p[ 2]+=p[ 5];p[ 4]+=p[ 3];p[ 1]=ROL8(p[ 1]) ^ p[ 6];p[ 7]=ROL2(p[ 7],35) ^ p[ 0];p[ 5]=ROR8(p[ 5]) ^ p[ 2];p[ 3]=ROL2(p[ 3],22) ^ p[ 4];\
+}
+
+#define addwBuff(x0,x1,x2,x3,x4){\
+ p[ 0]+=h[x0];\
+ p[ 1]+=h[x1];\
+ p[ 2]+=h[x2];\
+ p[ 3]+=h[x3];\
+ p[ 4]+=h[x4];\
+ p[ 5]+=c_buffer[i++];\
+ p[ 7]+=c_buffer[i++];\
+ p[ 6]+=c_buffer[i];\
+}
+
+#define addwCon(x0,x1,x2,x3,x4,x5,x6,x7,y0,y1,y2){\
+ p[ 0]+= h[x0];\
+ p[ 1]+= h[x1];\
+ p[ 2]+= h[x2];\
+ p[ 3]+= h[x3];\
+ p[ 4]+= h[x4];\
+ p[ 5]+= h[x5] + c_t[y0];\
+ p[ 6]+= h[x6] + c_t[y1];\
+ p[ 7]+= h[x7] + c_add[y2];\
+}
+
+
diff --git a/streebog/sib.cu b/streebog/sib.cu
index 24de940b7f..31176d1523 100644
--- a/streebog/sib.cu
+++ b/streebog/sib.cu
@@ -1,15 +1,9 @@
extern "C" {
-#include "sph/sph_blake.h"
-#include "sph/sph_bmw.h"
-#include "sph/sph_groestl.h"
#include "sph/sph_skein.h"
-#include "sph/sph_jh.h"
-#include "sph/sph_keccak.h"
#include "sph/sph_luffa.h"
-#include "sph/sph_cubehash.h"
-#include "sph/sph_shavite.h"
-#include "sph/sph_simd.h"
#include "sph/sph_echo.h"
+#include "sph/sph_fugue.h"
+#include "sph/sph_shabal.h"
#include "sph/sph_streebog.h"
}
@@ -17,94 +11,76 @@ extern "C" {
#include "cuda_helper.h"
#include "x11/cuda_x11.h"
-extern void streebog_cpu_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash);
-extern void keccak_streebog_cpu_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash);
-extern void keccak_streebog_luffa_cpu_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash);
-
#include
#include
#define NBN 2
+
static uint32_t *d_hash[MAX_GPUS];
static uint32_t *d_resNonce[MAX_GPUS];
static uint32_t *h_resNonce[MAX_GPUS];
+extern void streebog_cpu_hash_64_final(int thr_id, uint32_t threads, uint32_t *d_hash,uint32_t* d_resNonce);
+extern void streebog_set_target(const uint32_t* ptarget);
+
+extern void x14_shabal512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash);
+//extern void x13_fugue512_cpu_hash_64(int thr_id, uint32_t threads,uint32_t *d_hash);
+extern void x13_fugue512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash);
+extern void x13_fugue512_cpu_init(int thr_id, uint32_t threads);
+
+extern void x11_echo512_cpu_hash_64(int thr_id, uint32_t threads,uint32_t *d_hash);
+extern void x11_luffa512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash);
+
+extern void skein512_cpu_setBlock_80(void *pdata);
+extern void skein512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash);
+
// Sibcoin CPU Hash
-extern "C" void sibhash(void *output, const void *input)
+extern "C" void polyhash(void *output, const void *input)
{
unsigned char _ALIGN(128) hash[128] = { 0 };
- sph_blake512_context ctx_blake;
- sph_bmw512_context ctx_bmw;
- sph_groestl512_context ctx_groestl;
sph_skein512_context ctx_skein;
- sph_jh512_context ctx_jh;
- sph_keccak512_context ctx_keccak;
- sph_gost512_context ctx_gost;
- sph_luffa512_context ctx_luffa;
- sph_cubehash512_context ctx_cubehash;
- sph_shavite512_context ctx_shavite;
- sph_simd512_context ctx_simd;
+ sph_shabal512_context ctx_shabal;
sph_echo512_context ctx_echo;
+ sph_luffa512_context ctx_luffa;
+ sph_fugue512_context ctx_fugue;
+ sph_gost512_context ctx_gost;
- sph_blake512_init(&ctx_blake);
- sph_blake512 (&ctx_blake, input, 80);
- sph_blake512_close(&ctx_blake, (void*) hash);
- sph_bmw512_init(&ctx_bmw);
- sph_bmw512 (&ctx_bmw, (const void*) hash, 64);
- sph_bmw512_close(&ctx_bmw, (void*) hash);
+ sph_skein512_init(&ctx_skein);
+ sph_skein512 (&ctx_skein, input, 80);
+ sph_skein512_close(&ctx_skein, (void*) hash);
- sph_groestl512_init(&ctx_groestl);
- sph_groestl512 (&ctx_groestl, (const void*) hash, 64);
- sph_groestl512_close(&ctx_groestl, (void*) hash);
+ sph_shabal512_init(&ctx_shabal);
+ sph_shabal512 (&ctx_shabal, (const void*) hash, 64);
+ sph_shabal512_close(&ctx_shabal, (void*) hash);
- sph_skein512_init(&ctx_skein);
- sph_skein512 (&ctx_skein, (const void*) hash, 64);
- sph_skein512_close(&ctx_skein, (void*) hash);
+ sph_echo512_init(&ctx_echo);
+ sph_echo512 (&ctx_echo, (const void*) hash, 64);
+ sph_echo512_close(&ctx_echo, (void*) hash);
- sph_jh512_init(&ctx_jh);
- sph_jh512 (&ctx_jh, (const void*) hash, 64);
- sph_jh512_close(&ctx_jh, (void*) hash);
+ sph_luffa512_init(&ctx_luffa);
+ sph_luffa512 (&ctx_luffa, (const void*) hash, 64);
+ sph_luffa512_close (&ctx_luffa, (void*) hash);
- sph_keccak512_init(&ctx_keccak);
- sph_keccak512 (&ctx_keccak, (const void*) hash, 64);
- sph_keccak512_close(&ctx_keccak, (void*) hash);
+ sph_fugue512_init(&ctx_fugue);
+ sph_fugue512(&ctx_fugue, (const void*) hash, 64);
+ sph_fugue512_close(&ctx_fugue, (void*) hash);
sph_gost512_init(&ctx_gost);
sph_gost512(&ctx_gost, (const void*) hash, 64);
sph_gost512_close(&ctx_gost, (void*) hash);
- sph_luffa512_init(&ctx_luffa);
- sph_luffa512 (&ctx_luffa, (const void*) hash, 64);
- sph_luffa512_close (&ctx_luffa, (void*) hash);
-
- sph_cubehash512_init(&ctx_cubehash);
- sph_cubehash512 (&ctx_cubehash, (const void*) hash, 64);
- sph_cubehash512_close(&ctx_cubehash, (void*) hash);
-
- sph_shavite512_init(&ctx_shavite);
- sph_shavite512 (&ctx_shavite, (const void*) hash, 64);
- sph_shavite512_close(&ctx_shavite, (void*) hash);
-
- sph_simd512_init(&ctx_simd);
- sph_simd512 (&ctx_simd, (const void*) hash, 64);
- sph_simd512_close(&ctx_simd, (void*) hash);
-
- sph_echo512_init(&ctx_echo);
- sph_echo512 (&ctx_echo, (const void*) hash, 64);
- sph_echo512_close(&ctx_echo, (void*) hash);
-
memcpy(output, hash, 32);
}
//#define _DEBUG
-#define _DEBUG_PREFIX "sib"
+#define _DEBUG_PREFIX "poly"
#include "cuda_debug.cuh"
static bool init[MAX_GPUS] = { 0 };
-extern "C" int scanhash_sib(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done)
+extern "C" int scanhash_poly(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done)
{
int dev_id = device_map[thr_id];
@@ -122,10 +98,8 @@ extern "C" int scanhash_sib(int thr_id, struct work* work, uint32_t max_nonce, u
uint32_t throughput = cuda_default_throughput(thr_id, default_throughput); // 19=256*256*8;
if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce);
- throughput&=0xFFFFFF70; //multiples of 128 due to keccak_streebog_luffa
-
- if (opt_benchmark)
- ptarget[7] = 0xf;
+ if (opt_benchmark)
+ ptarget[7] = 0xf;
if (!init[thr_id])
{
@@ -139,8 +113,6 @@ extern "C" int scanhash_sib(int thr_id, struct work* work, uint32_t max_nonce, u
gpulog(LOG_INFO,thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput);
- x11_simd_echo_512_cpu_init(thr_id, throughput);
-
CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], 8 * sizeof(uint64_t) * throughput));
CUDA_SAFE_CALL(cudaMalloc(&d_resNonce[thr_id], NBN * sizeof(uint32_t)));
h_resNonce[thr_id] = (uint32_t*) malloc(NBN * sizeof(uint32_t));
@@ -148,34 +120,28 @@ extern "C" int scanhash_sib(int thr_id, struct work* work, uint32_t max_nonce, u
gpulog(LOG_ERR,thr_id,"Host memory allocation failed");
exit(EXIT_FAILURE);
}
-
+ x13_fugue512_cpu_init(thr_id, throughput);
init[thr_id] = true;
}
-
+
uint32_t endiandata[20];
for (int k=0; k < 20; k++)
be32enc(&endiandata[k], pdata[k]);
- quark_blake512_cpu_setBlock_80(thr_id, endiandata);
-
+ skein512_cpu_setBlock_80(endiandata);
cudaMemset(d_resNonce[thr_id], 0xff, NBN*sizeof(uint32_t));
+ streebog_set_target(ptarget);
do {
// Hash with CUDA
- quark_blake512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]);
- quark_bmw512_cpu_hash_64(thr_id, throughput, NULL, d_hash[thr_id]);
- quark_groestl512_cpu_hash_64(thr_id, throughput, NULL, d_hash[thr_id]);
- quark_skein512_cpu_hash_64(thr_id, throughput, NULL, d_hash[thr_id]);
- quark_jh512_cpu_hash_64(thr_id, throughput, NULL, d_hash[thr_id]);
-
-// quark_keccak512_cpu_hash_64(thr_id, throughput, NULL, d_hash[thr_id]);
-// streebog_cpu_hash_64(thr_id, throughput, d_hash[thr_id]);
- keccak_streebog_cpu_hash_64(thr_id, throughput, d_hash[thr_id]);
+
+ skein512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]);
+ x14_shabal512_cpu_hash_64(thr_id, throughput, d_hash[thr_id]);
+ x11_echo512_cpu_hash_64(thr_id, throughput, d_hash[thr_id]);
x11_luffa512_cpu_hash_64(thr_id, throughput, d_hash[thr_id]);
- x11_cubehash_shavite512_cpu_hash_64(thr_id, throughput, d_hash[thr_id]);
-// x11_luffaCubehashShavite512_cpu_hash_64(thr_id, throughput, d_hash[thr_id]);
- x11_simd_echo512_cpu_hash_64_final(thr_id, throughput, d_hash[thr_id], d_resNonce[thr_id], *(uint64_t*)&ptarget[6]);
+ x13_fugue512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id]);
+ streebog_cpu_hash_64_final(thr_id, throughput, d_hash[thr_id], d_resNonce[thr_id]);
cudaMemcpy(h_resNonce[thr_id], d_resNonce[thr_id], NBN*sizeof(uint32_t), cudaMemcpyDeviceToHost);
@@ -184,9 +150,9 @@ extern "C" int scanhash_sib(int thr_id, struct work* work, uint32_t max_nonce, u
const uint32_t startNounce = pdata[19];
uint32_t vhash64[8];
be32enc(&endiandata[19], startNounce + h_resNonce[thr_id][0]);
- sibhash(vhash64, endiandata);
+ polyhash(vhash64, endiandata);
- if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) {
+ if (vhash64[7] <= Htarg) {
int res = 1;
*hashes_done = pdata[19] - first_nonce + throughput;
work_set_target_ratio(work, vhash64);
@@ -196,7 +162,7 @@ extern "C" int scanhash_sib(int thr_id, struct work* work, uint32_t max_nonce, u
// if(!opt_quiet)
// gpulog(LOG_BLUE,dev_id,"Found 2nd nonce: %08x", pdata[21]);
be32enc(&endiandata[19], startNounce+h_resNonce[thr_id][1]);
- sibhash(vhash64, endiandata);
+ polyhash(vhash64, endiandata);
if (bn_hash_target_ratio(vhash64, ptarget) > work->shareratio[0]){
work_set_target_ratio(work, vhash64);
xchg(pdata[19],pdata[21]);
@@ -221,7 +187,7 @@ extern "C" int scanhash_sib(int thr_id, struct work* work, uint32_t max_nonce, u
}
// cleanup
-extern "C" void free_sib(int thr_id)
+extern "C" void free_poly(int thr_id)
{
if (!init[thr_id])
return;
@@ -231,8 +197,6 @@ extern "C" void free_sib(int thr_id)
free(h_resNonce[thr_id]);
cudaFree(d_resNonce[thr_id]);
cudaFree(d_hash[thr_id]);
-
- x11_simd_echo_512_cpu_free(thr_id);
init[thr_id] = false;
diff --git a/streebog/veltor.cu b/streebog/veltor.cu
index c288b4f3d7..ea1bd1c76d 100644
--- a/streebog/veltor.cu
+++ b/streebog/veltor.cu
@@ -95,14 +95,14 @@ extern "C" int scanhash_veltor(int thr_id, struct work* work, uint32_t max_nonce
init[thr_id] = true;
}
- uint32_t endiandata[20];
- for (int k=0; k < 20; k++)
- be32enc(&endiandata[k], pdata[k]);
+ uint32_t endiandata[20];
+ for (int k=0; k < 20; k++)
+ be32enc(&endiandata[k], pdata[k]);
- skein512_cpu_setBlock_80(endiandata);
+ skein512_cpu_setBlock_80(endiandata);
- cudaMemset(d_resNonce[thr_id], 0xff, NBN*sizeof(uint32_t));
- streebog_set_target(ptarget);
+ cudaMemset(d_resNonce[thr_id], 0xff, NBN*sizeof(uint32_t));
+ streebog_set_target(ptarget);
do {
skein512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]);
diff --git a/util.cpp b/util.cpp
index 146e89d9a1..5ef1ff358a 100644
--- a/util.cpp
+++ b/util.cpp
@@ -2182,7 +2182,7 @@ void print_hash_tests(void){
c11hash(&hash[0], &buf[0]);
printpfx("c11", hash);
- sibhash(&hash[0], &buf[0]);
+ polyhash(&hash[0], &buf[0]);
printpfx("sib", hash);
x13hash(&hash[0], &buf[0]);
diff --git a/x13/cuda_x13_fugue512.cu b/x13/cuda_x13_fugue512.cu
index b3d708a226..43bb744a15 100644
--- a/x13/cuda_x13_fugue512.cu
+++ b/x13/cuda_x13_fugue512.cu
@@ -1,32 +1,26 @@
+
+#include
+
+#define TPB 256
+
/*
- * Quick and dirty addition of Fugue-512 for X13
- *
- * Built on cbuchner1's implementation, actual hashing code
- * heavily based on phm's sgminer
- *
- *
- */
-#include "cuda_helper.h"
-#include "miner.h"
-#include "cuda_vectors.h"
-/*
- * X13 kernel implementation.
+ * fugue512 x13 kernel implementation.
*
* ==========================(LICENSE BEGIN)============================
*
- * Copyright (c) 2014-2016 phm, Provos Alexis
- *
+ * Copyright (c) 2014-2017 phm, tpruvot
+ *
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
- * "Software", to deal in the Software without restriction, including
+ * "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
- *
+ *
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
- *
+ *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
@@ -36,373 +30,356 @@
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* ===========================(LICENSE END)=============================
- *
- * @author phm
- * @author Provos Alexis (Applied partial shared Mem utilization under CUDA 7.5 for compute5.0/5.2 / 2016)
*/
-static __constant__ const uint32_t c_S[16] = {
- 0x8807a57e, 0xe616af75, 0xc5d3e4db, 0xac9ab027,
- 0xd915f117, 0xb6eecc54, 0x06e8020b, 0x4a92efd1,
- 0xaac6e2c9, 0xddb21398, 0xcae65838, 0x437f203f,
- 0x25ea78e7, 0x951fddd6, 0xda6ed11d, 0xe13e3567
+#ifdef __INTELLISENSE__
+#define __byte_perm(x, y, m) (x|y)
+#define tex1Dfetch(t, n) (n)
+#define __CUDACC__
+#include
+#endif
+
+// store allocated textures device addresses
+static unsigned int* d_textures[MAX_GPUS][1];
+
+#define mixtab0(x) mixtabs[(x)]
+#define mixtab1(x) mixtabs[(x)+256]
+#define mixtab2(x) mixtabs[(x)+512]
+#define mixtab3(x) mixtabs[(x)+768]
+
+static texture mixTab0Tex;
+
+static const uint32_t mixtab0[] = {
+ 0x63633297, 0x7c7c6feb, 0x77775ec7, 0x7b7b7af7, 0xf2f2e8e5, 0x6b6b0ab7, 0x6f6f16a7, 0xc5c56d39,
+ 0x303090c0, 0x01010704, 0x67672e87, 0x2b2bd1ac, 0xfefeccd5, 0xd7d71371, 0xabab7c9a, 0x767659c3,
+ 0xcaca4005, 0x8282a33e, 0xc9c94909, 0x7d7d68ef, 0xfafad0c5, 0x5959947f, 0x4747ce07, 0xf0f0e6ed,
+ 0xadad6e82, 0xd4d41a7d, 0xa2a243be, 0xafaf608a, 0x9c9cf946, 0xa4a451a6, 0x727245d3, 0xc0c0762d,
+ 0xb7b728ea, 0xfdfdc5d9, 0x9393d47a, 0x2626f298, 0x363682d8, 0x3f3fbdfc, 0xf7f7f3f1, 0xcccc521d,
+ 0x34348cd0, 0xa5a556a2, 0xe5e58db9, 0xf1f1e1e9, 0x71714cdf, 0xd8d83e4d, 0x313197c4, 0x15156b54,
+ 0x04041c10, 0xc7c76331, 0x2323e98c, 0xc3c37f21, 0x18184860, 0x9696cf6e, 0x05051b14, 0x9a9aeb5e,
+ 0x0707151c, 0x12127e48, 0x8080ad36, 0xe2e298a5, 0xebeba781, 0x2727f59c, 0xb2b233fe, 0x757550cf,
+ 0x09093f24, 0x8383a43a, 0x2c2cc4b0, 0x1a1a4668, 0x1b1b416c, 0x6e6e11a3, 0x5a5a9d73, 0xa0a04db6,
+ 0x5252a553, 0x3b3ba1ec, 0xd6d61475, 0xb3b334fa, 0x2929dfa4, 0xe3e39fa1, 0x2f2fcdbc, 0x8484b126,
+ 0x5353a257, 0xd1d10169, 0x00000000, 0xededb599, 0x2020e080, 0xfcfcc2dd, 0xb1b13af2, 0x5b5b9a77,
+ 0x6a6a0db3, 0xcbcb4701, 0xbebe17ce, 0x3939afe4, 0x4a4aed33, 0x4c4cff2b, 0x5858937b, 0xcfcf5b11,
+ 0xd0d0066d, 0xefefbb91, 0xaaaa7b9e, 0xfbfbd7c1, 0x4343d217, 0x4d4df82f, 0x333399cc, 0x8585b622,
+ 0x4545c00f, 0xf9f9d9c9, 0x02020e08, 0x7f7f66e7, 0x5050ab5b, 0x3c3cb4f0, 0x9f9ff04a, 0xa8a87596,
+ 0x5151ac5f, 0xa3a344ba, 0x4040db1b, 0x8f8f800a, 0x9292d37e, 0x9d9dfe42, 0x3838a8e0, 0xf5f5fdf9,
+ 0xbcbc19c6, 0xb6b62fee, 0xdada3045, 0x2121e784, 0x10107040, 0xffffcbd1, 0xf3f3efe1, 0xd2d20865,
+ 0xcdcd5519, 0x0c0c2430, 0x1313794c, 0xececb29d, 0x5f5f8667, 0x9797c86a, 0x4444c70b, 0x1717655c,
+ 0xc4c46a3d, 0xa7a758aa, 0x7e7e61e3, 0x3d3db3f4, 0x6464278b, 0x5d5d886f, 0x19194f64, 0x737342d7,
+ 0x60603b9b, 0x8181aa32, 0x4f4ff627, 0xdcdc225d, 0x2222ee88, 0x2a2ad6a8, 0x9090dd76, 0x88889516,
+ 0x4646c903, 0xeeeebc95, 0xb8b805d6, 0x14146c50, 0xdede2c55, 0x5e5e8163, 0x0b0b312c, 0xdbdb3741,
+ 0xe0e096ad, 0x32329ec8, 0x3a3aa6e8, 0x0a0a3628, 0x4949e43f, 0x06061218, 0x2424fc90, 0x5c5c8f6b,
+ 0xc2c27825, 0xd3d30f61, 0xacac6986, 0x62623593, 0x9191da72, 0x9595c662, 0xe4e48abd, 0x797974ff,
+ 0xe7e783b1, 0xc8c84e0d, 0x373785dc, 0x6d6d18af, 0x8d8d8e02, 0xd5d51d79, 0x4e4ef123, 0xa9a97292,
+ 0x6c6c1fab, 0x5656b943, 0xf4f4fafd, 0xeaeaa085, 0x6565208f, 0x7a7a7df3, 0xaeae678e, 0x08083820,
+ 0xbaba0bde, 0x787873fb, 0x2525fb94, 0x2e2ecab8, 0x1c1c5470, 0xa6a65fae, 0xb4b421e6, 0xc6c66435,
+ 0xe8e8ae8d, 0xdddd2559, 0x747457cb, 0x1f1f5d7c, 0x4b4bea37, 0xbdbd1ec2, 0x8b8b9c1a, 0x8a8a9b1e,
+ 0x70704bdb, 0x3e3ebaf8, 0xb5b526e2, 0x66662983, 0x4848e33b, 0x0303090c, 0xf6f6f4f5, 0x0e0e2a38,
+ 0x61613c9f, 0x35358bd4, 0x5757be47, 0xb9b902d2, 0x8686bf2e, 0xc1c17129, 0x1d1d5374, 0x9e9ef74e,
+ 0xe1e191a9, 0xf8f8decd, 0x9898e556, 0x11117744, 0x696904bf, 0xd9d93949, 0x8e8e870e, 0x9494c166,
+ 0x9b9bec5a, 0x1e1e5a78, 0x8787b82a, 0xe9e9a989, 0xcece5c15, 0x5555b04f, 0x2828d8a0, 0xdfdf2b51,
+ 0x8c8c8906, 0xa1a14ab2, 0x89899212, 0x0d0d2334, 0xbfbf10ca, 0xe6e684b5, 0x4242d513, 0x686803bb,
+ 0x4141dc1f, 0x9999e252, 0x2d2dc3b4, 0x0f0f2d3c, 0xb0b03df6, 0x5454b74b, 0xbbbb0cda, 0x16166258
};
-static __device__ uint32_t mixtab0[256] = {
- 0x63633297, 0x7c7c6feb, 0x77775ec7, 0x7b7b7af7, 0xf2f2e8e5, 0x6b6b0ab7, 0x6f6f16a7, 0xc5c56d39, 0x303090c0, 0x01010704, 0x67672e87, 0x2b2bd1ac, 0xfefeccd5, 0xd7d71371, 0xabab7c9a,
- 0x767659c3, 0xcaca4005, 0x8282a33e, 0xc9c94909, 0x7d7d68ef, 0xfafad0c5, 0x5959947f, 0x4747ce07, 0xf0f0e6ed, 0xadad6e82, 0xd4d41a7d, 0xa2a243be, 0xafaf608a, 0x9c9cf946, 0xa4a451a6,
- 0x727245d3, 0xc0c0762d, 0xb7b728ea, 0xfdfdc5d9, 0x9393d47a, 0x2626f298, 0x363682d8, 0x3f3fbdfc, 0xf7f7f3f1, 0xcccc521d, 0x34348cd0, 0xa5a556a2, 0xe5e58db9, 0xf1f1e1e9, 0x71714cdf,
- 0xd8d83e4d, 0x313197c4, 0x15156b54, 0x04041c10, 0xc7c76331, 0x2323e98c, 0xc3c37f21, 0x18184860, 0x9696cf6e, 0x05051b14, 0x9a9aeb5e, 0x0707151c, 0x12127e48, 0x8080ad36, 0xe2e298a5,
- 0xebeba781, 0x2727f59c, 0xb2b233fe, 0x757550cf, 0x09093f24, 0x8383a43a, 0x2c2cc4b0, 0x1a1a4668, 0x1b1b416c, 0x6e6e11a3, 0x5a5a9d73, 0xa0a04db6, 0x5252a553, 0x3b3ba1ec, 0xd6d61475,
- 0xb3b334fa, 0x2929dfa4, 0xe3e39fa1, 0x2f2fcdbc, 0x8484b126, 0x5353a257, 0xd1d10169, 0x00000000, 0xededb599, 0x2020e080, 0xfcfcc2dd, 0xb1b13af2, 0x5b5b9a77, 0x6a6a0db3, 0xcbcb4701,
- 0xbebe17ce, 0x3939afe4, 0x4a4aed33, 0x4c4cff2b, 0x5858937b, 0xcfcf5b11, 0xd0d0066d, 0xefefbb91, 0xaaaa7b9e, 0xfbfbd7c1, 0x4343d217, 0x4d4df82f, 0x333399cc, 0x8585b622, 0x4545c00f,
- 0xf9f9d9c9, 0x02020e08, 0x7f7f66e7, 0x5050ab5b, 0x3c3cb4f0, 0x9f9ff04a, 0xa8a87596, 0x5151ac5f, 0xa3a344ba, 0x4040db1b, 0x8f8f800a, 0x9292d37e, 0x9d9dfe42, 0x3838a8e0, 0xf5f5fdf9,
- 0xbcbc19c6, 0xb6b62fee, 0xdada3045, 0x2121e784, 0x10107040, 0xffffcbd1, 0xf3f3efe1, 0xd2d20865, 0xcdcd5519, 0x0c0c2430, 0x1313794c, 0xececb29d, 0x5f5f8667, 0x9797c86a, 0x4444c70b,
- 0x1717655c, 0xc4c46a3d, 0xa7a758aa, 0x7e7e61e3, 0x3d3db3f4, 0x6464278b, 0x5d5d886f, 0x19194f64, 0x737342d7, 0x60603b9b, 0x8181aa32, 0x4f4ff627, 0xdcdc225d, 0x2222ee88, 0x2a2ad6a8,
- 0x9090dd76, 0x88889516, 0x4646c903, 0xeeeebc95, 0xb8b805d6, 0x14146c50, 0xdede2c55, 0x5e5e8163, 0x0b0b312c, 0xdbdb3741, 0xe0e096ad, 0x32329ec8, 0x3a3aa6e8, 0x0a0a3628, 0x4949e43f,
- 0x06061218, 0x2424fc90, 0x5c5c8f6b, 0xc2c27825, 0xd3d30f61, 0xacac6986, 0x62623593, 0x9191da72, 0x9595c662, 0xe4e48abd, 0x797974ff, 0xe7e783b1, 0xc8c84e0d, 0x373785dc, 0x6d6d18af,
- 0x8d8d8e02, 0xd5d51d79, 0x4e4ef123, 0xa9a97292, 0x6c6c1fab, 0x5656b943, 0xf4f4fafd, 0xeaeaa085, 0x6565208f, 0x7a7a7df3, 0xaeae678e, 0x08083820, 0xbaba0bde, 0x787873fb, 0x2525fb94,
- 0x2e2ecab8, 0x1c1c5470, 0xa6a65fae, 0xb4b421e6, 0xc6c66435, 0xe8e8ae8d, 0xdddd2559, 0x747457cb, 0x1f1f5d7c, 0x4b4bea37, 0xbdbd1ec2, 0x8b8b9c1a, 0x8a8a9b1e, 0x70704bdb, 0x3e3ebaf8,
- 0xb5b526e2, 0x66662983, 0x4848e33b, 0x0303090c, 0xf6f6f4f5, 0x0e0e2a38, 0x61613c9f, 0x35358bd4, 0x5757be47, 0xb9b902d2, 0x8686bf2e, 0xc1c17129, 0x1d1d5374, 0x9e9ef74e, 0xe1e191a9,
- 0xf8f8decd, 0x9898e556, 0x11117744, 0x696904bf, 0xd9d93949, 0x8e8e870e, 0x9494c166, 0x9b9bec5a, 0x1e1e5a78, 0x8787b82a, 0xe9e9a989, 0xcece5c15, 0x5555b04f, 0x2828d8a0, 0xdfdf2b51,
- 0x8c8c8906, 0xa1a14ab2, 0x89899212, 0x0d0d2334, 0xbfbf10ca, 0xe6e684b5, 0x4242d513, 0x686803bb, 0x4141dc1f, 0x9999e252, 0x2d2dc3b4, 0x0f0f2d3c, 0xb0b03df6, 0x5454b74b, 0xbbbb0cda,
- 0x16166258
-};
-
-#define mixtab0(x) shared[0][x]
-#define mixtab1(x) shared[1][x]
-#define mixtab2(x) shared[2][x]
-#define mixtab3(x) shared[3][x]
-
#define TIX4(q, x00, x01, x04, x07, x08, x22, x24, x27, x30) { \
- x22 ^= x00; \
- x00 = (q); \
- x08 ^= (q); \
- x01 ^= x24; \
- x04 ^= x27; \
- x07 ^= x30; \
- }
+ x22 ^= x00; \
+ x00 = (q); \
+ x08 ^= x00; \
+ x01 ^= x24; \
+ x04 ^= x27; \
+ x07 ^= x30; \
+}
#define CMIX36(x00, x01, x02, x04, x05, x06, x18, x19, x20) { \
- x00 ^= x04; \
- x01 ^= x05; \
- x02 ^= x06; \
- x18 ^= x04; \
- x19 ^= x05; \
- x20 ^= x06; \
- }
+ x00 ^= x04; \
+ x01 ^= x05; \
+ x02 ^= x06; \
+ x18 ^= x04; \
+ x19 ^= x05; \
+ x20 ^= x06; \
+}
-__device__ __forceinline__
-static void SMIX(const uint32_t shared[4][256], uint32_t &x0,uint32_t &x1,uint32_t &x2,uint32_t &x3){
- uint32_t c0 = mixtab0(__byte_perm(x0,0,0x4443));
- uint32_t r1 = mixtab1(__byte_perm(x0,0,0x4442));
- uint32_t r2 = mixtab2(__byte_perm(x0,0,0x4441));
- uint32_t r3 = mixtab3(__byte_perm(x0,0,0x4440));
- c0 = c0 ^ r1 ^ r2 ^ r3;
- uint32_t r0 = mixtab0(__byte_perm(x1,0,0x4443));
- uint32_t c1 = r0 ^ mixtab1(__byte_perm(x1,0,0x4442));
- uint32_t tmp = mixtab2(__byte_perm(x1,0,0x4441));
- c1 ^= tmp;
- r2 ^= tmp;
- tmp = mixtab3(__byte_perm(x1,0,0x4440));
- c1 ^= tmp;
- r3 ^= tmp;
- uint32_t c2 = mixtab0(__byte_perm(x2,0,0x4443));
- r0 ^= c2;
- tmp = mixtab1(__byte_perm(x2,0,0x4442));
- c2 ^= tmp;
- r1 ^= tmp;
- tmp = mixtab2(__byte_perm(x2,0,0x4441));
- c2 ^= tmp;
- tmp = mixtab3(__byte_perm(x2,0,0x4440));
- c2 ^= tmp;
- r3 ^= tmp;
- uint32_t c3 = mixtab0(__byte_perm(x3,0,0x4443));
- r0 ^= c3;
- tmp = mixtab1(__byte_perm(x3,0,0x4442));
- c3 ^= tmp;
- r1 ^= tmp;
- tmp = mixtab2(__byte_perm(x3,0,0x4441));
- c3 ^= tmp;
- r2 ^= tmp;
- tmp = mixtab3(__byte_perm(x3,0,0x4440));
- c3 ^= tmp;
- x0 = ((c0 ^ (r0 << 0)) & 0xFF000000) | ((c1 ^ (r1 << 0)) & 0x00FF0000) | ((c2 ^ (r2 << 0)) & 0x0000FF00) | ((c3 ^ (r3 << 0)) & 0x000000FF);
- x1 = ((c1 ^ (r0 << 8)) & 0xFF000000) | ((c2 ^ (r1 << 8)) & 0x00FF0000) | ((c3 ^ (r2 << 8)) & 0x0000FF00) | ((c0 ^ (r3 >>24)) & 0x000000FF);
- x2 = ((c2 ^ (r0 <<16)) & 0xFF000000) | ((c3 ^ (r1 <<16)) & 0x00FF0000) | ((c0 ^ (r2 >>16)) & 0x0000FF00) | ((c1 ^ (r3 >>16)) & 0x000000FF);
- x3 = ((c3 ^ (r0 <<24)) & 0xFF000000) | ((c0 ^ (r1 >> 8)) & 0x00FF0000) | ((c1 ^ (r2 >> 8)) & 0x0000FF00) | ((c2 ^ (r3 >> 8)) & 0x000000FF);
+#define SMIX(x0, x1, x2, x3) { \
+ uint32_t tmp; \
+ uint32_t r0 = 0; \
+ uint32_t r1 = 0; \
+ uint32_t r2 = 0; \
+ uint32_t r3 = 0; \
+ uint32_t c0 = mixtab0(x0 >> 24); \
+ tmp = mixtab1((x0 >> 16) & 0xFF); \
+ c0 ^= tmp; \
+ r1 ^= tmp; \
+ tmp = mixtab2((x0 >> 8) & 0xFF); \
+ c0 ^= tmp; \
+ r2 ^= tmp; \
+ tmp = mixtab3(x0 & 0xFF); \
+ c0 ^= tmp; \
+ r3 ^= tmp; \
+ tmp = mixtab0(x1 >> 24); \
+ uint32_t c1 = tmp; \
+ r0 ^= tmp; \
+ tmp = mixtab1((x1 >> 16) & 0xFF); \
+ c1 ^= tmp; \
+ tmp = mixtab2((x1 >> 8) & 0xFF); \
+ c1 ^= tmp; \
+ r2 ^= tmp; \
+ tmp = mixtab3(x1 & 0xFF); \
+ c1 ^= tmp; \
+ r3 ^= tmp; \
+ tmp = mixtab0(x2 >> 24); \
+ uint32_t c2 = tmp; \
+ r0 ^= tmp; \
+ tmp = mixtab1((x2 >> 16) & 0xFF); \
+ c2 ^= tmp; \
+ r1 ^= tmp; \
+ tmp = mixtab2((x2 >> 8) & 0xFF); \
+ c2 ^= tmp; \
+ tmp = mixtab3(x2 & 0xFF); \
+ c2 ^= tmp; \
+ r3 ^= tmp; \
+ tmp = mixtab0(x3 >> 24); \
+ uint32_t c3 = tmp; \
+ r0 ^= tmp; \
+ tmp = mixtab1((x3 >> 16) & 0xFF); \
+ c3 ^= tmp; \
+ r1 ^= tmp; \
+ tmp = mixtab2((x3 >> 8) & 0xFF); \
+ c3 ^= tmp; \
+ r2 ^= tmp; \
+ tmp = mixtab3(x3 & 0xFF); \
+ c3 ^= tmp; \
+ x0 = ((c0 ^ r0) & 0xFF000000) | ((c1 ^ r1) & 0x00FF0000) \
+ | ((c2 ^ r2) & 0x0000FF00) | ((c3 ^ r3) & 0x000000FF); \
+ x1 = ((c1 ^ (r0 << 8)) & 0xFF000000) | ((c2 ^ (r1 << 8)) & 0x00FF0000) \
+ | ((c3 ^ (r2 << 8)) & 0x0000FF00) | ((c0 ^ (r3 >> 24)) & 0x000000FF); \
+ x2 = ((c2 ^ (r0 << 16)) & 0xFF000000) | ((c3 ^ (r1 << 16)) & 0x00FF0000) \
+ | ((c0 ^ (r2 >> 16)) & 0x0000FF00) | ((c1 ^ (r3 >> 16)) & 0x000000FF); \
+ x3 = ((c3 ^ (r0 << 24)) & 0xFF000000) | ((c0 ^ (r1 >> 8)) & 0x00FF0000) \
+ | ((c1 ^ (r2 >> 8)) & 0x0000FF00) | ((c2 ^ (r3 >> 8)) & 0x000000FF); \
}
-__device__
-static void SMIX_LDG(const uint32_t shared[4][256], uint32_t &x0,uint32_t &x1,uint32_t &x2,uint32_t &x3){
- uint32_t c0 = __ldg(&mixtab0[__byte_perm(x0,0,0x4443)]);
- uint32_t r1 = mixtab1(__byte_perm(x0,0,0x4442));
- uint32_t r2 = mixtab2(__byte_perm(x0,0,0x4441));
- uint32_t r3 = mixtab3(__byte_perm(x0,0,0x4440));
- c0 = c0 ^ r1 ^ r2 ^ r3;
- uint32_t r0 = __ldg(&mixtab0[__byte_perm(x1,0,0x4443)]);
- uint32_t c1 = r0 ^ mixtab1(__byte_perm(x1,0,0x4442));
- uint32_t tmp = mixtab2(__byte_perm(x1,0,0x4441));
- c1 ^= tmp;
- r2 ^= tmp;
- tmp = mixtab3(__byte_perm(x1,0,0x4440));
- c1 ^= tmp;
- r3 ^= tmp;
- uint32_t c2 = __ldg(&mixtab0[__byte_perm(x2,0,0x4443)]);
- r0 ^= c2;
- tmp = mixtab1(__byte_perm(x2,0,0x4442));
- c2 ^= tmp;
- r1 ^= tmp;
- tmp = mixtab2(__byte_perm(x2,0,0x4441));
- c2 ^= tmp;
- tmp = mixtab3(__byte_perm(x2,0,0x4440));
- c2 ^= tmp;
- r3 ^= tmp;
- uint32_t c3 = __ldg(&mixtab0[__byte_perm(x3,0,0x4443)]);
- r0 ^= c3;
- tmp = mixtab1(__byte_perm(x3,0,0x4442));
- c3 ^= tmp;
- r1 ^= tmp;
- tmp = mixtab2(__byte_perm(x3,0,0x4441));
- c3 ^= tmp;
- r2 ^= tmp;
- tmp = ROL8(__ldg(&mixtab0[__byte_perm(x3,0,0x4440)]));
- c3 ^= tmp;
- x0 = ((c0 ^ (r0 << 0)) & 0xFF000000) | ((c1 ^ (r1 << 0)) & 0x00FF0000) | ((c2 ^ (r2 << 0)) & 0x0000FF00) | ((c3 ^ (r3 << 0)) & 0x000000FF);
- x1 = ((c1 ^ (r0 << 8)) & 0xFF000000) | ((c2 ^ (r1 << 8)) & 0x00FF0000) | ((c3 ^ (r2 << 8)) & 0x0000FF00) | ((c0 ^ (r3 >>24)) & 0x000000FF);
- x2 = ((c2 ^ (r0 <<16)) & 0xFF000000) | ((c3 ^ (r1 <<16)) & 0x00FF0000) | ((c0 ^ (r2 >>16)) & 0x0000FF00) | ((c1 ^ (r3 >>16)) & 0x000000FF);
- x3 = ((c3 ^ (r0 <<24)) & 0xFF000000) | ((c0 ^ (r1 >> 8)) & 0x00FF0000) | ((c1 ^ (r2 >> 8)) & 0x0000FF00) | ((c2 ^ (r3 >> 8)) & 0x000000FF);
+#define SUB_ROR3 { \
+ B33 = S33, B34 = S34, B35 = S35; \
+ S35 = S32; S34 = S31; S33 = S30; S32 = S29; S31 = S28; S30 = S27; S29 = S26; S28 = S25; S27 = S24; \
+ S26 = S23; S25 = S22; S24 = S21; S23 = S20; S22 = S19; S21 = S18; S20 = S17; S19 = S16; S18 = S15; \
+ S17 = S14; S16 = S13; S15 = S12; S14 = S11; S13 = S10; S12 = S09; S11 = S08; S10 = S07; S09 = S06; \
+ S08 = S05; S07 = S04; S06 = S03; S05 = S02; S04 = S01; S03 = S00; S02 = B35; S01 = B34; S00 = B33; \
}
-#define mROR3 { \
- B[ 6] = S[33], B[ 7] = S[34], B[ 8] = S[35]; \
- S[35] = S[32]; S[34] = S[31]; S[33] = S[30]; S[32] = S[29]; S[31] = S[28]; S[30] = S[27]; S[29] = S[26]; S[28] = S[25]; S[27] = S[24]; \
- S[26] = S[23]; S[25] = S[22]; S[24] = S[21]; S[23] = S[20]; S[22] = S[19]; S[21] = S[18]; S[20] = S[17]; S[19] = S[16]; S[18] = S[15]; \
- S[17] = S[14]; S[16] = S[13]; S[15] = S[12]; S[14] = S[11]; S[13] = S[10]; S[12] = S[ 9]; S[11] = S[ 8]; S[10] = S[ 7]; S[ 9] = S[ 6]; \
- S[ 8] = S[ 5]; S[ 7] = S[ 4]; S[ 6] = S[ 3]; S[ 5] = S[ 2]; S[ 4] = S[ 1]; S[ 3] = S[ 0]; S[ 2] = B[ 8]; S[ 1] = B[ 7]; S[ 0] = B[ 6]; \
- }
-#define mROR8 { \
- B[ 1] = S[28], B[ 2] = S[29], B[ 3] = S[30], B[ 4] = S[31], B[ 5] = S[32], B[ 6] = S[33], B[ 7] = S[34], B[ 8] = S[35]; \
- S[35] = S[27]; S[34] = S[26]; S[33] = S[25]; S[32] = S[24]; S[31] = S[23]; S[30] = S[22]; S[29] = S[21]; S[28] = S[20]; S[27] = S[19]; \
- S[26] = S[18]; S[25] = S[17]; S[24] = S[16]; S[23] = S[15]; S[22] = S[14]; S[21] = S[13]; S[20] = S[12]; S[19] = S[11]; S[18] = S[10]; \
- S[17] = S[ 9]; S[16] = S[ 8]; S[15] = S[ 7]; S[14] = S[ 6]; S[13] = S[ 5]; S[12] = S[ 4]; S[11] = S[ 3]; S[10] = S[ 2]; S[ 9] = S[ 1]; \
- S[ 8] = S[ 0]; S[ 7] = B[ 8]; S[ 6] = B[ 7]; S[ 5] = B[ 6]; S[ 4] = B[ 5]; S[ 3] = B[ 4]; S[ 2] = B[ 3]; S[ 1] = B[ 2]; S[ 0] = B[ 1]; \
- }
+#define SUB_ROR8 { \
+ B28 = S28, B29 = S29, B30 = S30, B31 = S31, B32 = S32, B33 = S33, B34 = S34, B35 = S35; \
+ S35 = S27; S34 = S26; S33 = S25; S32 = S24; S31 = S23; S30 = S22; S29 = S21; S28 = S20; S27 = S19; \
+ S26 = S18; S25 = S17; S24 = S16; S23 = S15; S22 = S14; S21 = S13; S20 = S12; S19 = S11; S18 = S10; \
+ S17 = S09; S16 = S08; S15 = S07; S14 = S06; S13 = S05; S12 = S04; S11 = S03; S10 = S02; S09 = S01; \
+ S08 = S00; S07 = B35; S06 = B34; S05 = B33; S04 = B32; S03 = B31; S02 = B30; S01 = B29; S00 = B28; \
+}
-#define mROR9 { \
- B[ 0] = S[27], B[ 1] = S[28], B[ 2] = S[29], B[ 3] = S[30], B[ 4] = S[31], B[ 5] = S[32], B[ 6] = S[33], B[ 7] = S[34], B[ 8] = S[35]; \
- S[35] = S[26]; S[34] = S[25]; S[33] = S[24]; S[32] = S[23]; S[31] = S[22]; S[30] = S[21]; S[29] = S[20]; S[28] = S[19]; S[27] = S[18]; \
- S[26] = S[17]; S[25] = S[16]; S[24] = S[15]; S[23] = S[14]; S[22] = S[13]; S[21] = S[12]; S[20] = S[11]; S[19] = S[10]; S[18] = S[ 9]; \
- S[17] = S[ 8]; S[16] = S[ 7]; S[15] = S[ 6]; S[14] = S[ 5]; S[13] = S[ 4]; S[12] = S[ 3]; S[11] = S[ 2]; S[10] = S[ 1]; S[ 9] = S[ 0]; \
- S[ 8] = B[ 8]; S[ 7] = B[ 7]; S[ 6] = B[ 6]; S[ 5] = B[ 5]; S[ 4] = B[ 4]; S[ 3] = B[ 3]; S[ 2] = B[ 2]; S[ 1] = B[ 1]; S[ 0] = B[ 0]; \
- }
+#define SUB_ROR9 { \
+ B27 = S27, B28 = S28, B29 = S29, B30 = S30, B31 = S31, B32 = S32, B33 = S33, B34 = S34, B35 = S35; \
+ S35 = S26; S34 = S25; S33 = S24; S32 = S23; S31 = S22; S30 = S21; S29 = S20; S28 = S19; S27 = S18; \
+ S26 = S17; S25 = S16; S24 = S15; S23 = S14; S22 = S13; S21 = S12; S20 = S11; S19 = S10; S18 = S09; \
+ S17 = S08; S16 = S07; S15 = S06; S14 = S05; S13 = S04; S12 = S03; S11 = S02; S10 = S01; S09 = S00; \
+ S08 = B35; S07 = B34; S06 = B33; S05 = B32; S04 = B31; S03 = B30; S02 = B29; S01 = B28; S00 = B27; \
+}
-#define FUGUE512_3(x, y, z) { \
- TIX4(x, S[ 0], S[ 1], S[ 4], S[ 7], S[ 8], S[22], S[24], S[27], S[30]); \
- CMIX36(S[33], S[34], S[35], S[ 1], S[ 2], S[ 3], S[15], S[16], S[17]); \
- SMIX_LDG(shared, S[33], S[34], S[35], S[ 0]); \
- CMIX36(S[30], S[31], S[32], S[34], S[35], S[ 0], S[12], S[13], S[14]); \
- SMIX_LDG(shared, S[30], S[31], S[32], S[33]); \
- CMIX36(S[27], S[28], S[29], S[31], S[32], S[33], S[ 9], S[10], S[11]); \
- SMIX(shared, S[27], S[28], S[29], S[30]); \
- CMIX36(S[24], S[25], S[26], S[28], S[29], S[30], S[ 6], S[ 7], S[ 8]); \
- SMIX_LDG(shared, S[24], S[25], S[26], S[27]); \
- \
- TIX4(y, S[24], S[25], S[28], S[31], S[32], S[10], S[12], S[15], S[18]); \
- CMIX36(S[21], S[22], S[23], S[25], S[26], S[27], S[ 3], S[ 4], S[ 5]); \
- SMIX(shared, S[21], S[22], S[23], S[24]); \
- CMIX36(S[18], S[19], S[20], S[22], S[23], S[24], S[ 0], S[ 1], S[ 2]); \
- SMIX_LDG(shared, S[18], S[19], S[20], S[21]); \
- CMIX36(S[15], S[16], S[17], S[19], S[20], S[21], S[33], S[34], S[35]); \
- SMIX_LDG(shared, S[15], S[16], S[17], S[18]); \
- CMIX36(S[12], S[13], S[14], S[16], S[17], S[18], S[30], S[31], S[32]); \
- SMIX(shared, S[12], S[13], S[14], S[15]); \
- \
- TIX4(z, S[12], S[13], S[16], S[19], S[20], S[34], S[ 0], S[ 3], S[ 6]); \
- CMIX36(S[ 9], S[10], S[11], S[13], S[14], S[15], S[27], S[28], S[29]); \
- SMIX_LDG(shared, S[ 9], S[10], S[11], S[12]); \
- CMIX36(S[ 6], S[ 7], S[ 8], S[10], S[11], S[12], S[24], S[25], S[26]); \
- SMIX_LDG(shared, S[ 6], S[ 7], S[ 8], S[ 9]); \
- CMIX36(S[ 3], S[ 4], S[ 5], S[ 7], S[ 8], S[ 9], S[21], S[22], S[23]); \
- SMIX_LDG(shared, S[ 3], S[ 4], S[ 5], S[ 6]); \
- CMIX36(S[ 0], S[ 1], S[ 2], S[ 4], S[ 5], S[ 6], S[18], S[19], S[20]); \
- SMIX_LDG(shared, S[ 0], S[ 1], S[ 2], S[ 3]); \
- }
+#define FUGUE512_3(x, y, z) { \
+ TIX4(x, S00, S01, S04, S07, S08, S22, S24, S27, S30); \
+ CMIX36(S33, S34, S35, S01, S02, S03, S15, S16, S17); \
+ SMIX(S33, S34, S35, S00); \
+ CMIX36(S30, S31, S32, S34, S35, S00, S12, S13, S14); \
+ SMIX(S30, S31, S32, S33); \
+ CMIX36(S27, S28, S29, S31, S32, S33, S09, S10, S11); \
+ SMIX(S27, S28, S29, S30); \
+ CMIX36(S24, S25, S26, S28, S29, S30, S06, S07, S08); \
+ SMIX(S24, S25, S26, S27); \
+ \
+ TIX4(y, S24, S25, S28, S31, S32, S10, S12, S15, S18); \
+ CMIX36(S21, S22, S23, S25, S26, S27, S03, S04, S05); \
+ SMIX(S21, S22, S23, S24); \
+ CMIX36(S18, S19, S20, S22, S23, S24, S00, S01, S02); \
+ SMIX(S18, S19, S20, S21); \
+ CMIX36(S15, S16, S17, S19, S20, S21, S33, S34, S35); \
+ SMIX(S15, S16, S17, S18); \
+ CMIX36(S12, S13, S14, S16, S17, S18, S30, S31, S32); \
+ SMIX(S12, S13, S14, S15); \
+ \
+ TIX4(z, S12, S13, S16, S19, S20, S34, S00, S03, S06); \
+ CMIX36(S09, S10, S11, S13, S14, S15, S27, S28, S29); \
+ SMIX(S09, S10, S11, S12); \
+ CMIX36(S06, S07, S08, S10, S11, S12, S24, S25, S26); \
+ SMIX(S06, S07, S08, S09); \
+ CMIX36(S03, S04, S05, S07, S08, S09, S21, S22, S23); \
+ SMIX(S03, S04, S05, S06); \
+ CMIX36(S00, S01, S02, S04, S05, S06, S18, S19, S20); \
+ SMIX(S00, S01, S02, S03); \
+}
+
+
+#define AS_UINT4(addr) *((uint4*)(addr))
/***************************************************/
-// Die Hash-Funktion
-__global__ __launch_bounds__(256,3)
+__global__
+__launch_bounds__(TPB)
void x13_fugue512_gpu_hash_64(uint32_t threads, uint64_t *g_hash)
{
- __shared__ uint32_t shared[4][256];
-
-// if(threadIdx.x<256){
- const uint32_t tmp = mixtab0[threadIdx.x];
- shared[0][threadIdx.x] = tmp;
- shared[1][threadIdx.x] = ROR8(tmp);
- shared[2][threadIdx.x] = ROL16(tmp);
- shared[3][threadIdx.x] = ROL8(tmp);
-// }
- __syncthreads();
- const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
- if (thread < threads)
- {
- uint32_t *hash = (uint32_t*)&g_hash[thread<<3];
-
- uint32_t S[36];
- uint32_t B[ 9];
-
- uint32_t Hash[16];
-
- *(uint2x4*)&Hash[0] = __ldg4((uint2x4*)&hash[0]);
- *(uint2x4*)&Hash[8] = __ldg4((uint2x4*)&hash[8]);
- __syncthreads();
-
- S[ 0] = S[ 1] = S[ 2] = S[ 3] = S[ 4] = S[ 5] = S[ 6] = S[ 7] = S[ 8] = S[ 9] = S[10] = S[11] = S[12] = S[13] = S[14] = S[15] = S[16] = S[17] = S[18] = S[19] = 0;
- *(uint2x4*)&S[20] = *(uint2x4*)&c_S[ 0];
- *(uint2x4*)&S[28] = *(uint2x4*)&c_S[ 8];
-
- FUGUE512_3(Hash[0x0], Hash[0x1], Hash[0x2]);
- FUGUE512_3(Hash[0x3], Hash[0x4], Hash[0x5]);
- FUGUE512_3(Hash[0x6], Hash[0x7], Hash[0x8]);
- FUGUE512_3(Hash[0x9], Hash[0xA], Hash[0xB]);
- FUGUE512_3(Hash[0xC], Hash[0xD], Hash[0xE]);
- FUGUE512_3(Hash[0xF], 0U, 512U);
-
- for (uint32_t i = 0; i < 32; i+=2){
- mROR3;
- CMIX36(S[ 0], S[ 1], S[ 2], S[ 4], S[ 5], S[ 6], S[18], S[19], S[20]);
- SMIX_LDG(shared, S[ 0], S[ 1], S[ 2], S[ 3]);
- mROR3;
- CMIX36(S[ 0], S[ 1], S[ 2], S[ 4], S[ 5], S[ 6], S[18], S[19], S[20]);
- SMIX_LDG(shared, S[ 0], S[ 1], S[ 2], S[ 3]);
- }
- #pragma unroll
- for (uint32_t i = 0; i < 13; i ++) {
- S[ 4] ^= S[ 0]; S[ 9] ^= S[ 0]; S[18] ^= S[ 0]; S[27] ^= S[ 0];
- mROR9;
- SMIX_LDG(shared, S[ 0], S[ 1], S[ 2], S[ 3]);
- S[ 4] ^= S[ 0]; S[10] ^= S[ 0]; S[18] ^= S[ 0]; S[27] ^= S[ 0];
- mROR9;
- SMIX_LDG(shared, S[ 0], S[ 1], S[ 2], S[ 3]);
- S[ 4] ^= S[ 0]; S[10] ^= S[ 0]; S[19] ^= S[ 0]; S[27] ^= S[ 0];
- mROR9;
- SMIX_LDG(shared, S[ 0], S[ 1], S[ 2], S[ 3]);
- S[ 4] ^= S[ 0]; S[10] ^= S[ 0]; S[19] ^= S[ 0]; S[28] ^= S[ 0];
- mROR8;
- SMIX_LDG(shared, S[ 0], S[ 1], S[ 2], S[ 3]);
- }
- S[ 4] ^= S[ 0]; S[ 9] ^= S[ 0]; S[18] ^= S[ 0]; S[27] ^= S[ 0];
-
- S[ 0] = cuda_swab32(S[ 1]); S[ 1] = cuda_swab32(S[ 2]); S[ 2] = cuda_swab32(S[ 3]); S[ 3] = cuda_swab32(S[ 4]);
- S[ 4] = cuda_swab32(S[ 9]); S[ 5] = cuda_swab32(S[10]); S[ 6] = cuda_swab32(S[11]); S[ 7] = cuda_swab32(S[12]);
- S[ 8] = cuda_swab32(S[18]); S[ 9] = cuda_swab32(S[19]); S[10] = cuda_swab32(S[20]); S[11] = cuda_swab32(S[21]);
- S[12] = cuda_swab32(S[27]); S[13] = cuda_swab32(S[28]); S[14] = cuda_swab32(S[29]); S[15] = cuda_swab32(S[30]);
-
- *(uint2x4*)&hash[ 0] = *(uint2x4*)&S[ 0];
- *(uint2x4*)&hash[ 8] = *(uint2x4*)&S[ 8];
+ __shared__ uint32_t mixtabs[1024];
+
+ // load shared mem (with 256 threads)
+ const uint32_t thr = threadIdx.x & 0xFF;
+ const uint32_t tmp = tex1Dfetch(mixTab0Tex, thr);
+ mixtabs[thr] = tmp;
+ mixtabs[thr+256] = ROR8(tmp);
+ mixtabs[thr+512] = ROL16(tmp);
+ mixtabs[thr+768] = ROL8(tmp);
+#if TPB <= 256
+ if (blockDim.x < 256) {
+ const uint32_t thr = (threadIdx.x + 0x80) & 0xFF;
+ const uint32_t tmp = tex1Dfetch(mixTab0Tex, thr);
+ mixtabs[thr] = tmp;
+ mixtabs[thr + 256] = ROR8(tmp);
+ mixtabs[thr + 512] = ROL16(tmp);
+ mixtabs[thr + 768] = ROL8(tmp);
}
-}
+#endif
-/***************************************************/
-// The final hash function
-__global__ __launch_bounds__(512,2) /* force 56 registers */
-void x13_fugue512_gpu_hash_64_final(uint32_t threads,const uint32_t* __restrict__ g_hash,uint32_t* resNonce, const uint64_t target){
-
- __shared__ uint32_t shared[4][256];
-
- if(threadIdx.x<256){
- const uint32_t tmp = mixtab0[threadIdx.x];
- shared[0][threadIdx.x] = tmp;
- shared[1][threadIdx.x] = ROR8(tmp);
- shared[2][threadIdx.x] = ROL16(tmp);
- shared[3][threadIdx.x] = ROL8(tmp);
- }
+ __syncthreads();
- const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
+ uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
{
- const uint32_t* __restrict__ hash = &g_hash[thread<<4];
-
- uint32_t S[36];
- uint32_t B[ 9];
+ const size_t hashPosition = thread;
+ uint64_t*pHash = &g_hash[hashPosition<<3];
uint32_t Hash[16];
-
- *(uint2x4*)&Hash[0] = __ldg4((uint2x4*)&hash[0]);
- *(uint2x4*)&Hash[8] = __ldg4((uint2x4*)&hash[8]);
- __syncthreads();
- S[ 0] = S[ 1] = S[ 2] = S[ 3] = S[ 4] = S[ 5] = S[ 6] = S[ 7] = S[ 8] = S[ 9] = S[10] = S[11] = S[12] = S[13] = S[14] = S[15] = S[16] = S[17] = S[18] = S[19] = 0;
- *(uint2x4*)&S[20] = *(uint2x4*)&c_S[ 0];
- *(uint2x4*)&S[28] = *(uint2x4*)&c_S[ 8];
-
- FUGUE512_3(Hash[0x0], Hash[0x1], Hash[0x2]);
- FUGUE512_3(Hash[0x3], Hash[0x4], Hash[0x5]);
- FUGUE512_3(Hash[0x6], Hash[0x7], Hash[0x8]);
- FUGUE512_3(Hash[0x9], Hash[0xA], Hash[0xB]);
- FUGUE512_3(Hash[0xC], Hash[0xD], Hash[0xE]);
- FUGUE512_3(Hash[0xF], 0, 512);
-
- for (int i = 0; i < 32; i++){
- mROR3;
- CMIX36(S[ 0], S[ 1], S[ 2], S[ 4], S[ 5], S[ 6], S[18], S[19], S[20]);
- SMIX_LDG(shared, S[ 0], S[ 1], S[ 2], S[ 3]);
- }
- #pragma unroll
- for (int i = 0; i < 12; i++) {
- S[ 4] ^= S[ 0]; S[ 9] ^= S[ 0]; S[18] ^= S[ 0]; S[27] ^= S[ 0];
- mROR9;
- SMIX_LDG(shared, S[ 0], S[ 1], S[ 2], S[ 3]);
- S[ 4] ^= S[ 0]; S[10] ^= S[ 0]; S[18] ^= S[ 0]; S[27] ^= S[ 0];
- mROR9;
- SMIX_LDG(shared, S[ 0], S[ 1], S[ 2], S[ 3]);
- S[ 4] ^= S[ 0]; S[10] ^= S[ 0]; S[19] ^= S[ 0]; S[27] ^= S[ 0];
- mROR9;
- SMIX_LDG(shared, S[ 0], S[ 1], S[ 2], S[ 3]);
- S[ 4] ^= S[ 0]; S[10] ^= S[ 0]; S[19] ^= S[ 0]; S[28] ^= S[ 0];
- mROR8;
- SMIX_LDG(shared, S[ 0], S[ 1], S[ 2], S[ 3]);
+
+ #pragma unroll 4
+ for(int i = 0; i < 4; i++)
+ AS_UINT4(&Hash[i*4]) = AS_UINT4(&pHash[i*2]);
+
+ #pragma unroll 16
+ for(int i = 0; i < 16; i++)
+ Hash[i] = cuda_swab32(Hash[i]);
+
+ uint32_t S00, S01, S02, S03, S04, S05, S06, S07, S08, S09;
+ uint32_t S10, S11, S12, S13, S14, S15, S16, S17, S18, S19;
+ uint32_t S20, S21, S22, S23, S24, S25, S26, S27, S28, S29;
+ uint32_t S30, S31, S32, S33, S34, S35;
+
+ uint32_t B27, B28, B29, B30, B31, B32, B33, B34, B35;
+ //const uint64_t bc = (64ULL << 3); // 512
+ //const uint32_t bclo = (uint32_t)(bc);
+ //const uint32_t bchi = (uint32_t)(bc >> 32);
+
+ S00 = S01 = S02 = S03 = S04 = S05 = S06 = S07 = S08 = S09 = 0;
+ S10 = S11 = S12 = S13 = S14 = S15 = S16 = S17 = S18 = S19 = 0;
+ S20 = 0x8807a57e; S21 = 0xe616af75; S22 = 0xc5d3e4db; S23 = 0xac9ab027;
+ S24 = 0xd915f117; S25 = 0xb6eecc54; S26 = 0x06e8020b; S27 = 0x4a92efd1;
+ S28 = 0xaac6e2c9; S29 = 0xddb21398; S30 = 0xcae65838; S31 = 0x437f203f;
+ S32 = 0x25ea78e7; S33 = 0x951fddd6; S34 = 0xda6ed11d; S35 = 0xe13e3567;
+
+ FUGUE512_3((Hash[0x0]), (Hash[0x1]), (Hash[0x2]));
+ FUGUE512_3((Hash[0x3]), (Hash[0x4]), (Hash[0x5]));
+ FUGUE512_3((Hash[0x6]), (Hash[0x7]), (Hash[0x8]));
+ FUGUE512_3((Hash[0x9]), (Hash[0xA]), (Hash[0xB]));
+ FUGUE512_3((Hash[0xC]), (Hash[0xD]), (Hash[0xE]));
+ FUGUE512_3((Hash[0xF]), 0u /*bchi*/, 512u /*bclo*/);
+
+ #pragma unroll 32
+ for (int i = 0; i < 32; i ++) {
+ SUB_ROR3;
+ CMIX36(S00, S01, S02, S04, S05, S06, S18, S19, S20);
+ SMIX(S00, S01, S02, S03);
}
- S[ 4] ^= S[ 0]; S[ 9] ^= S[ 0]; S[18] ^= S[ 0]; S[27] ^= S[ 0];
- mROR9;
- SMIX_LDG(shared, S[ 0], S[ 1], S[ 2], S[ 3]);
- S[ 4] ^= S[ 0]; S[10] ^= S[ 0]; S[18] ^= S[ 0]; S[27] ^= S[ 0];
- mROR9;
- SMIX_LDG(shared, S[ 0], S[ 1], S[ 2], S[ 3]);
- S[ 4] ^= S[ 0]; S[10] ^= S[ 0]; S[19] ^= S[ 0]; S[27] ^= S[ 0];
- mROR9;
- SMIX_LDG(shared, S[ 0], S[ 1], S[ 2], S[ 3]);
-
- S[ 3] = cuda_swab32(S[3]); S[ 4] = cuda_swab32(S[4]^S[ 0]);
-
- const uint64_t check = *(uint64_t*)&S[ 3];
- if(check <= target){
- uint32_t tmp = atomicExch(&resNonce[0], thread);
- if (tmp != UINT32_MAX)
- resNonce[1] = tmp;
+ #pragma unroll 13
+ for (int i = 0; i < 13; i++) {
+ S04 ^= S00;
+ S09 ^= S00;
+ S18 ^= S00;
+ S27 ^= S00;
+ SUB_ROR9;
+ SMIX(S00, S01, S02, S03);
+ S04 ^= S00;
+ S10 ^= S00;
+ S18 ^= S00;
+ S27 ^= S00;
+ SUB_ROR9;
+ SMIX(S00, S01, S02, S03);
+ S04 ^= S00;
+ S10 ^= S00;
+ S19 ^= S00;
+ S27 ^= S00;
+ SUB_ROR9;
+ SMIX(S00, S01, S02, S03);
+ S04 ^= S00;
+ S10 ^= S00;
+ S19 ^= S00;
+ S28 ^= S00;
+ SUB_ROR8;
+ SMIX(S00, S01, S02, S03);
}
+ S04 ^= S00;
+ S09 ^= S00;
+ S18 ^= S00;
+ S27 ^= S00;
+
+ Hash[0] = cuda_swab32(S01);
+ Hash[1] = cuda_swab32(S02);
+ Hash[2] = cuda_swab32(S03);
+ Hash[3] = cuda_swab32(S04);
+ Hash[4] = cuda_swab32(S09);
+ Hash[5] = cuda_swab32(S10);
+ Hash[6] = cuda_swab32(S11);
+ Hash[7] = cuda_swab32(S12);
+ Hash[8] = cuda_swab32(S18);
+ Hash[9] = cuda_swab32(S19);
+ Hash[10] = cuda_swab32(S20);
+ Hash[11] = cuda_swab32(S21);
+ Hash[12] = cuda_swab32(S27);
+ Hash[13] = cuda_swab32(S28);
+ Hash[14] = cuda_swab32(S29);
+ Hash[15] = cuda_swab32(S30);
+
+ #pragma unroll 4
+ for(int i = 0; i < 4; i++)
+ AS_UINT4(&pHash[i*2]) = AS_UINT4(&Hash[i*4]);
}
}
-__host__
-void x13_fugue512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash){
-
- const uint32_t threadsperblock = 256;
-
- // berechne wie viele Thread Blocks wir brauchen
- dim3 grid((threads + threadsperblock-1)/threadsperblock);
- dim3 block(threadsperblock);
+#define texDef(id, texname, texmem, texsource, texsize) { \
+ unsigned int *texmem; \
+ cudaMalloc(&texmem, texsize); \
+ d_textures[thr_id][id] = texmem; \
+ cudaMemcpy(texmem, texsource, texsize, cudaMemcpyHostToDevice); \
+ texname.normalized = 0; \
+ texname.filterMode = cudaFilterModePoint; \
+ texname.addressMode[0] = cudaAddressModeClamp; \
+ { cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(); \
+ cudaBindTexture(NULL, &texname, texmem, &channelDesc, texsize ); \
+ } \
+}
- x13_fugue512_gpu_hash_64<<>>(threads, (uint64_t*)d_hash);
+__host__
+void x13_fugue512_cpu_init(int thr_id, uint32_t threads)
+{
+ texDef(0, mixTab0Tex, mixTab0m, mixtab0, sizeof(uint32_t)*256);
}
__host__
-void x13_fugue512_cpu_hash_64_final(int thr_id, uint32_t threads, uint32_t *d_hash, uint32_t *d_resNonce, const uint64_t target){
+void x13_fugue512_cpu_free(int thr_id)
+{
+ cudaFree(d_textures[thr_id][0]);
+}
- const uint32_t threadsperblock = 512;
+__host__
+//void fugue512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash)
+void x13_fugue512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash)
+{
+ const uint32_t threadsperblock = TPB;
- // berechne wie viele Thread Blocks wir brauchen
dim3 grid((threads + threadsperblock-1)/threadsperblock);
dim3 block(threadsperblock);
- x13_fugue512_gpu_hash_64_final<<>>(threads, d_hash,d_resNonce,target);
+ x13_fugue512_gpu_hash_64 <<>> (threads, (uint64_t*)d_hash);
}