diff --git a/algos.h b/algos.h
index 1a16fb193c..29c6ef09a9 100644
--- a/algos.h
+++ b/algos.h
@@ -24,7 +24,7 @@ enum sha_algos {
 	ALGO_X11,
 	ALGO_X11EVO,
 	ALGO_C11,
-	ALGO_SIB,
+	ALGO_POLY,
 	ALGO_X13,
 	ALGO_X14,
 	ALGO_X15,
@@ -61,7 +61,7 @@ static const char *algo_names[] = {
 	"x11",
 	"x11evo",
 	"c11",
-	"sib",
+	"poly",
 	"x13",
 	"x14",
 	"x15",
diff --git a/bench.cpp b/bench.cpp
index e3926b0a6a..e6696bce3a 100644
--- a/bench.cpp
+++ b/bench.cpp
@@ -60,7 +60,7 @@ void algo_free_all(int thr_id){
 	free_x11(thr_id);
 	free_x11evo(thr_id);
 	free_c11(thr_id);
-	free_sib(thr_id);
+	free_poly(thr_id);
 	free_x13(thr_id);
 	free_x14(thr_id);
 	free_x15(thr_id);
diff --git a/ccminer.cpp b/ccminer.cpp
index cd9e7964b5..ac16cbb299 100644
--- a/ccminer.cpp
+++ b/ccminer.cpp
@@ -229,7 +229,7 @@ Options:\n\
 			qubit       Qubit\n\
 			x11         X11              (DarkCoin)\n\
 			c11         C11              (Chaincoin)\n\
-			sib         X11+gost         (Sibcoin)\n\
+			poly        Veltor+stuff     (Polytimos)\n\
 			x11evo      Permuted x11     (Revolver)\n\
 			x13         X13              (MaruCoin)\n\
 			x14         X14              (BernCoin)\n\
@@ -1928,7 +1928,7 @@ static void *miner_thread(void *userdata)
 					minmax = 0x8000000;
 					break;
 				case ALGO_NEOSCRYPT:
-				case ALGO_SIB:
+				case ALGO_POLY:
 				case ALGO_VELTOR:
 				case ALGO_LYRA2:
 					minmax = 0x80000;
@@ -2063,8 +2063,8 @@ static void *miner_thread(void *userdata)
 			case ALGO_HSR:
 				rc = scanhash_hsr(thr_id, &work, max_nonce, &hashes_done);
 				break;
-			case ALGO_SIB:
-				rc = scanhash_sib(thr_id, &work, max_nonce, &hashes_done);
+			case ALGO_POLY:
+				rc = scanhash_poly(thr_id, &work, max_nonce, &hashes_done);
 				break;
 			case ALGO_VELTOR:
 				rc = scanhash_veltor(thr_id, &work, max_nonce, &hashes_done);
diff --git a/ccminer.vcxproj b/ccminer.vcxproj
index 10a006d03d..ceb7cf8f24 100644
--- a/ccminer.vcxproj
+++ b/ccminer.vcxproj
@@ -414,6 +414,7 @@
     <CudaCompile Include="x13\cuda_x13_hamsi_fugue512.cu">
       <MaxRegCount>72</MaxRegCount>
     </CudaCompile>
+    <CudaCompile Include="x13\cuda_x13_fugue512.cu" />
     <CudaCompile Include="x13\hsr.cu" />
     <CudaCompile Include="x13\x13.cu" />
     <CudaCompile Include="x14\x14.cu" />
diff --git a/ccminer.vcxproj.filters b/ccminer.vcxproj.filters
index a3d68641da..15f4f287e6 100644
--- a/ccminer.vcxproj.filters
+++ b/ccminer.vcxproj.filters
@@ -628,6 +628,9 @@
     <CudaCompile Include="x13\cuda_x13_hamsi_fugue512.cu">
       <Filter>Source Files\CUDA\x13</Filter>
     </CudaCompile>
+    <CudaCompile Include="x13\cuda_x13_fugue512.cu">
+     <Filter>Source Files\CUDA\x13</Filter>
+    </CudaCompile>
     <CudaCompile Include="quark\cuda_keccak_skein512.cu">
       <Filter>Source Files\CUDA\quark</Filter>
     </CudaCompile>
diff --git a/miner.h b/miner.h
index 7624acb2e1..34ce49cba2 100644
--- a/miner.h
+++ b/miner.h
@@ -282,7 +282,7 @@ extern int scanhash_whirl(int thr_id, struct work* work, uint32_t max_nonce, uns
 extern int scanhash_x11(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
 extern int scanhash_x11evo(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
 extern int scanhash_c11(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
-extern int scanhash_sib(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
+extern int scanhash_poly(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
 extern int scanhash_x13(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
 extern int scanhash_x14(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
 extern int scanhash_x15(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
@@ -322,7 +322,7 @@ extern void free_whirl(int thr_id);
 extern void free_x11(int thr_id);
 extern void free_x11evo(int thr_id);
 extern void free_c11(int thr_id);
-extern void free_sib(int thr_id);
+extern void free_poly(int thr_id);
 extern void free_x13(int thr_id);
 extern void free_x14(int thr_id);
 extern void free_x15(int thr_id);
@@ -805,7 +805,7 @@ void wcoinhash(void *state, const void *input);
 void x11hash(void *output, const void *input);
 void x11evo_hash(void *output, const void *input);
 void c11hash(void *output, const void *input);
-void sibhash(void *output, const void *input);
+void polyhash(void *output, const void *input);
 void x13hash(void *output, const void *input);
 void x14hash(void *output, const void *input);
 void x15hash(void *output, const void *input);
diff --git a/quark/cuda_quark_skein512.cu b/quark/cuda_quark_skein512.cu
index 5db1873e61..b21ce5a8c7 100644
--- a/quark/cuda_quark_skein512.cu
+++ b/quark/cuda_quark_skein512.cu
@@ -420,8 +420,150 @@ void skein512_gpu_hash_80(uint32_t threads, uint32_t startNounce, uint64_t *outp
 	}
 
 }
-
 __host__
+void skein512_cpu_setBlock_80( void *pdata)
+{
+        // Host-side setup: derives values from the 80 bytes at pdata and uploads them to the c_buffer symbol.
+        uint64_t message[20];
+        memcpy(&message[0], pdata, 80);
+        uint64_t p[8];
+        uint64_t h[9];
+        uint64_t t0, t1, t2;
+
+        h[0] = 0x4903ADFF749C51CEull;
+        h[1] = 0x0D95DE399746DF03ull;
+        h[2] = 0x8FD1934127C79BCEull;
+        h[3] = 0x9A255629FF352CB1ull;
+        h[4] = 0x5DB62599DF6CA7B0ull;
+        h[5] = 0xEABE394CA9D5C3F4ull;
+        h[6] = 0x991112C71A75B523ull;
+        h[7] = 0xAE18A40B660FCC33ull;
+        // h[8] = h[0] ^ h[1] ^ h[2] ^ h[3] ^ h[4] ^ h[5] ^ h[6] ^ h[7] ^ SPH_C64(0x1BD11BDAA9FC1A22);
+        h[8] = 0xcab2076d98173ec4ULL;
+
+        t0 = 64; // ptr
+        t1 = 0x7000000000000000ull;
+        t2 = 0x7000000000000040ull;
+
+        memcpy(&p[0], &message[0], 64);
+        // 18 alternating TFBIG_4e_PRE / TFBIG_4o_PRE steps over p, then one final TFBIG_ADDKEY_PRE.
+        TFBIG_4e_PRE(0);
+        TFBIG_4o_PRE(1);
+        TFBIG_4e_PRE(2);
+        TFBIG_4o_PRE(3);
+        TFBIG_4e_PRE(4);
+        TFBIG_4o_PRE(5);
+        TFBIG_4e_PRE(6);
+        TFBIG_4o_PRE(7);
+        TFBIG_4e_PRE(8);
+        TFBIG_4o_PRE(9);
+        TFBIG_4e_PRE(10);
+        TFBIG_4o_PRE(11);
+        TFBIG_4e_PRE(12);
+        TFBIG_4o_PRE(13);
+        TFBIG_4e_PRE(14);
+        TFBIG_4o_PRE(15);
+        TFBIG_4e_PRE(16);
+        TFBIG_4o_PRE(17);
+        TFBIG_ADDKEY_PRE(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], h, t, 18);
+        message[10] = message[0] ^ p[0];
+        message[11] = message[1] ^ p[1];
+        message[12] = message[2] ^ p[2];
+        message[13] = message[3] ^ p[3];
+        message[14] = message[4] ^ p[4];
+        message[15] = message[5] ^ p[5];
+        message[16] = message[6] ^ p[6];
+        message[17] = message[7] ^ p[7];
+        message[18] = t2;
+
+        uint64_t buffer[128] = { 0 }; // zero-filled: only slots 0..57 are assigned before the upload
+
+        // message[8] is not placed in slot 0; it is stored at buffer[57] further down.
+        buffer[ 0] = message[ 9];
+        h[0] = buffer[ 1] = message[10];
+        h[1] = buffer[ 2] = message[11];
+        h[2] = buffer[ 3] = message[12];
+        h[3] = buffer[ 4] = message[13];
+        h[4] = buffer[ 5] = message[14];
+        h[5] = buffer[ 6] = message[15];
+        h[6] = buffer[ 7] = message[16];
+        h[7] = buffer[ 8] = message[17];
+        h[8] = buffer[ 9] = h[0]^h[1]^h[2]^h[3]^h[4]^h[5]^h[6]^h[7]^0x1BD11BDAA9FC1A22ULL;
+
+        t0 = 0x50ull;
+        t1 = 0xB000000000000000ull;
+        t2 = t0^t1;
+
+        p[0] = message[ 8] + h[0];
+        p[2] = h[2]; p[3] = h[3]; p[4] = h[4];
+        p[5] = h[5] + t0;
+        p[6] = h[6] + t1;
+        p[7] = h[7];
+
+        p[2] += p[3];
+        p[4] += p[5]; p[6] += p[7];
+
+        p[3] = ROTL64(p[3], 36) ^ p[2];
+        p[5] = ROTL64(p[5], 19) ^ p[4];
+        p[7] = ROTL64(p[7], 37) ^ p[6];
+        p[4] += p[7]; p[6] += p[5];
+
+        p[7] = ROTL64(p[7], 27) ^ p[4];
+        p[5] = ROTL64(p[5], 14) ^ p[6];
+        buffer[10] = p[ 0];
+        buffer[11] = p[ 2];
+        buffer[12] = p[ 3];
+        buffer[13] = p[ 4];
+        buffer[14] = p[ 5];
+        buffer[15] = p[ 6];
+        buffer[16] = p[ 7];
+        buffer[17] = ROTL64(p[3], 42);
+        buffer[18] = ROTL64(p[5], 36);
+        buffer[19] = ROTL64(p[7], 39);
+        // Slots 20..56: an h word plus either t0/t1/t2 or a small counter (1..18).
+        buffer[20] = h[6]+t1;
+        buffer[21] = h[8]+1;
+        buffer[22] = h[7]+t2;
+        buffer[23] = h[0]+2;
+        buffer[24] = h[8]+t0;
+        buffer[25] = h[1]+3;
+        buffer[26] = h[0]+t1;
+        buffer[27] = h[2]+4;
+        buffer[28] = h[1]+t2;
+        buffer[29] = h[3]+5;
+        buffer[30] = h[2]+t0;
+        buffer[31] = h[4]+6;
+        buffer[32] = h[3]+t1;
+        buffer[33] = h[5]+7;
+        buffer[34] = h[4]+t2;
+        buffer[35] = h[6]+8;
+        buffer[36] = h[5]+t0;
+        buffer[37] = h[7]+9;
+        buffer[38] = h[6]+t1;
+        buffer[39] = h[8]+10;
+        buffer[40] = h[7]+t2;
+        buffer[41] = h[0]+11;
+        buffer[42] = h[8]+t0;
+        buffer[43] = h[1]+12;
+        buffer[44] = h[0]+t1;
+        buffer[45] = h[2]+13;
+        buffer[46] = h[1]+t2;
+        buffer[47] = h[3]+14;
+        buffer[48] = h[2]+t0;
+        buffer[49] = h[4]+15;
+        buffer[50] = h[3]+t1;
+        buffer[51] = h[5]+16;
+        buffer[52] = h[4]+t2;
+        buffer[53] = h[6]+17;
+        buffer[54] = h[5]+t0;
+        buffer[55] = h[7]+18;
+        buffer[56] = h[6]+t1;
+
+        buffer[57] = message[ 8];
+        // Upload is checked so a failed copy is not silently ignored. NOTE(review): confirm sizeof(c_buffer) <= sizeof(buffer).
+        CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_buffer, buffer, sizeof(c_buffer), 0, cudaMemcpyHostToDevice));
+}
+/*
 void skein512_cpu_setBlock_80(void *pdata)
 {
 	uint64_t message[20];
@@ -564,7 +706,7 @@ void skein512_cpu_setBlock_80(void *pdata)
 
 	CUDA_SAFE_CALL(cudaGetLastError());
 }
-
+*/
 __host__
 void skein512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *g_hash)
 {
diff --git a/quark/skein_header.h b/quark/skein_header.h
new file mode 100644
index 0000000000..460b311f57
--- /dev/null
+++ b/quark/skein_header.h
@@ -0,0 +1,385 @@
+/* Elementary defines for SKEIN */
+
+/*
+ * M9_ ## s ## _ ## i  evaluates to s+i mod 9 (0 <= s <= 18, 0 <= i <= 7).
+ */
+
+#define M9_0_0    0
+#define M9_0_1    1
+#define M9_0_2    2
+#define M9_0_3    3
+#define M9_0_4    4
+#define M9_0_5    5
+#define M9_0_6    6
+#define M9_0_7    7
+
+#define M9_1_0    1
+#define M9_1_1    2
+#define M9_1_2    3
+#define M9_1_3    4
+#define M9_1_4    5
+#define M9_1_5    6
+#define M9_1_6    7
+#define M9_1_7    8
+
+#define M9_2_0    2
+#define M9_2_1    3
+#define M9_2_2    4
+#define M9_2_3    5
+#define M9_2_4    6
+#define M9_2_5    7
+#define M9_2_6    8
+#define M9_2_7    0
+
+#define M9_3_0    3
+#define M9_3_1    4
+#define M9_3_2    5
+#define M9_3_3    6
+#define M9_3_4    7
+#define M9_3_5    8
+#define M9_3_6    0
+#define M9_3_7    1
+
+#define M9_4_0    4
+#define M9_4_1    5
+#define M9_4_2    6
+#define M9_4_3    7
+#define M9_4_4    8
+#define M9_4_5    0
+#define M9_4_6    1
+#define M9_4_7    2
+
+#define M9_5_0    5
+#define M9_5_1    6
+#define M9_5_2    7
+#define M9_5_3    8
+#define M9_5_4    0
+#define M9_5_5    1
+#define M9_5_6    2
+#define M9_5_7    3
+
+#define M9_6_0    6
+#define M9_6_1    7
+#define M9_6_2    8
+#define M9_6_3    0
+#define M9_6_4    1
+#define M9_6_5    2
+#define M9_6_6    3
+#define M9_6_7    4
+
+#define M9_7_0    7
+#define M9_7_1    8
+#define M9_7_2    0
+#define M9_7_3    1
+#define M9_7_4    2
+#define M9_7_5    3
+#define M9_7_6    4
+#define M9_7_7    5
+
+#define M9_8_0    8
+#define M9_8_1    0
+#define M9_8_2    1
+#define M9_8_3    2
+#define M9_8_4    3
+#define M9_8_5    4
+#define M9_8_6    5
+#define M9_8_7    6
+
+#define M9_9_0    0
+#define M9_9_1    1
+#define M9_9_2    2
+#define M9_9_3    3
+#define M9_9_4    4
+#define M9_9_5    5
+#define M9_9_6    6
+#define M9_9_7    7
+
+#define M9_10_0   1
+#define M9_10_1   2
+#define M9_10_2   3
+#define M9_10_3   4
+#define M9_10_4   5
+#define M9_10_5   6
+#define M9_10_6   7
+#define M9_10_7   8
+
+#define M9_11_0   2
+#define M9_11_1   3
+#define M9_11_2   4
+#define M9_11_3   5
+#define M9_11_4   6
+#define M9_11_5   7
+#define M9_11_6   8
+#define M9_11_7   0
+
+#define M9_12_0   3
+#define M9_12_1   4
+#define M9_12_2   5
+#define M9_12_3   6
+#define M9_12_4   7
+#define M9_12_5   8
+#define M9_12_6   0
+#define M9_12_7   1
+
+#define M9_13_0   4
+#define M9_13_1   5
+#define M9_13_2   6
+#define M9_13_3   7
+#define M9_13_4   8
+#define M9_13_5   0
+#define M9_13_6   1
+#define M9_13_7   2
+
+#define M9_14_0   5
+#define M9_14_1   6
+#define M9_14_2   7
+#define M9_14_3   8
+#define M9_14_4   0
+#define M9_14_5   1
+#define M9_14_6   2
+#define M9_14_7   3
+
+#define M9_15_0   6
+#define M9_15_1   7
+#define M9_15_2   8
+#define M9_15_3   0
+#define M9_15_4   1
+#define M9_15_5   2
+#define M9_15_6   3
+#define M9_15_7   4
+
+#define M9_16_0   7
+#define M9_16_1   8
+#define M9_16_2   0
+#define M9_16_3   1
+#define M9_16_4   2
+#define M9_16_5   3
+#define M9_16_6   4
+#define M9_16_7   5
+
+#define M9_17_0   8
+#define M9_17_1   0
+#define M9_17_2   1
+#define M9_17_3   2
+#define M9_17_4   3
+#define M9_17_5   4
+#define M9_17_6   5
+#define M9_17_7   6
+
+#define M9_18_0   0
+#define M9_18_1   1
+#define M9_18_2   2
+#define M9_18_3   3
+#define M9_18_4   4
+#define M9_18_5   5
+#define M9_18_6   6
+#define M9_18_7   7
+
+/*
+ * M3_ ## s ## _ ## i  evaluates to s+i mod 3 (0 <= s <= 18, 0 <= i <= 1).
+ */
+
+#define M3_0_0    0
+#define M3_0_1    1
+#define M3_1_0    1
+#define M3_1_1    2
+#define M3_2_0    2
+#define M3_2_1    0
+#define M3_3_0    0
+#define M3_3_1    1
+#define M3_4_0    1
+#define M3_4_1    2
+#define M3_5_0    2
+#define M3_5_1    0
+#define M3_6_0    0
+#define M3_6_1    1
+#define M3_7_0    1
+#define M3_7_1    2
+#define M3_8_0    2
+#define M3_8_1    0
+#define M3_9_0    0
+#define M3_9_1    1
+#define M3_10_0   1
+#define M3_10_1   2
+#define M3_11_0   2
+#define M3_11_1   0
+#define M3_12_0   0
+#define M3_12_1   1
+#define M3_13_0   1
+#define M3_13_1   2
+#define M3_14_0   2
+#define M3_14_1   0
+#define M3_15_0   0
+#define M3_15_1   1
+#define M3_16_0   1
+#define M3_16_1   2
+#define M3_17_0   2
+#define M3_17_1   0
+#define M3_18_0   0
+#define M3_18_1   1
+
+#define XCAT(x, y)     XCAT_(x, y)
+#define XCAT_(x, y)    x ## y
+
+#define SKBI(k, s, i)   XCAT(k, XCAT(XCAT(XCAT(M9_, s), _), i))
+#define SKBT(t, s, v)   XCAT(t, XCAT(XCAT(XCAT(M3_, s), _), v))
+
+#define TFBIG_ADDKEY(w0, w1, w2, w3, w4, w5, w6, w7, k, t, s) { /* adds key words to w0..w7, tweak words to w5/w6, and make_uint2(s,0) to w7 */ \
+		w0 = (w0 + SKBI(k, s, 0)); \
+		w1 = (w1 + SKBI(k, s, 1)); \
+		w2 = (w2 + SKBI(k, s, 2)); \
+		w3 = (w3 + SKBI(k, s, 3)); \
+		w4 = (w4 + SKBI(k, s, 4)); \
+		w5 = (w5 + SKBI(k, s, 5) + SKBT(t, s, 0)); \
+		w6 = (w6 + SKBI(k, s, 6) + SKBT(t, s, 1)); \
+		w7 = (w7 + SKBI(k, s, 7) + make_uint2(s,0)); \
+	}
+
+#define TFBIG_MIX(x0, x1, rc) { \
+		x0 = x0 + x1; \
+		x1 = ROL2(x1, rc) ^ x0; \
+	}
+
+#define TFBIG_MIX8(w0, w1, w2, w3, w4, w5, w6, w7, rc0, rc1, rc2, rc3) { \
+		TFBIG_MIX(w0, w1, rc0); \
+		TFBIG_MIX(w2, w3, rc1); \
+		TFBIG_MIX(w4, w5, rc2); \
+		TFBIG_MIX(w6, w7, rc3); \
+	}
+
+#define TFBIG_4e(s)  { \
+		TFBIG_ADDKEY(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], h, t, s); \
+		TFBIG_MIX8(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], 46, 36, 19, 37); \
+		TFBIG_MIX8(p[2], p[1], p[4], p[7], p[6], p[5], p[0], p[3], 33, 27, 14, 42); \
+		TFBIG_MIX8(p[4], p[1], p[6], p[3], p[0], p[5], p[2], p[7], 17, 49, 36, 39); \
+		TFBIG_MIX8(p[6], p[1], p[0], p[7], p[2], p[5], p[4], p[3], 44,  9, 54, 56); \
+	}
+
+#define TFBIG_4o(s)  { \
+		TFBIG_ADDKEY(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], h, t, s); \
+		TFBIG_MIX8(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], 39, 30, 34, 24); \
+		TFBIG_MIX8(p[2], p[1], p[4], p[7], p[6], p[5], p[0], p[3], 13, 50, 10, 17); \
+		TFBIG_MIX8(p[4], p[1], p[6], p[3], p[0], p[5], p[2], p[7], 25, 29, 39, 43); \
+		TFBIG_MIX8(p[6], p[1], p[0], p[7], p[2], p[5], p[4], p[3],  8, 35, 56, 22); \
+	}
+
+#define TFBIG_KINIT_UI2(k0, k1, k2, k3, k4, k5, k6, k7, k8, t0, t1, t2) { \
+		k8 = ((k0 ^ k1) ^ (k2 ^ k3)) ^ ((k4 ^ k5) ^ (k6 ^ k7)) \
+			^ vectorize(0x1BD11BDAA9FC1A22); \
+		t2 = t0 ^ t1; \
+	}
+
+#define TFBIG_ADDKEY_UI2(w0, w1, w2, w3, w4, w5, w6, w7, k, t, s) { \
+		w0 = (w0 + SKBI(k, s, 0)); \
+		w1 = (w1 + SKBI(k, s, 1)); \
+		w2 = (w2 + SKBI(k, s, 2)); \
+		w3 = (w3 + SKBI(k, s, 3)); \
+		w4 = (w4 + SKBI(k, s, 4)); \
+		w5 = (w5 + SKBI(k, s, 5) + SKBT(t, s, 0)); \
+		w6 = (w6 + SKBI(k, s, 6) + SKBT(t, s, 1)); \
+		w7 = (w7 + SKBI(k, s, 7) + vectorize(s)); \
+	}
+
+#define TFBIG_ADDKEY_PRE(w0, w1, w2, w3, w4, w5, w6, w7, k, t, s) { \
+		w0 = (w0 + SKBI(k, s, 0)); \
+		w1 = (w1 + SKBI(k, s, 1)); \
+		w2 = (w2 + SKBI(k, s, 2)); \
+		w3 = (w3 + SKBI(k, s, 3)); \
+		w4 = (w4 + SKBI(k, s, 4)); \
+		w5 = (w5 + SKBI(k, s, 5) + SKBT(t, s, 0)); \
+		w6 = (w6 + SKBI(k, s, 6) + SKBT(t, s, 1)); \
+		w7 = (w7 + SKBI(k, s, 7) + (s)); \
+	}
+
+#define TFBIG_MIX_UI2(x0, x1, rc) { \
+		x0 = x0 + x1; \
+		x1 = ROL2(x1, rc) ^ x0; \
+	}
+
+#define TFBIG_MIX_PRE(x0, x1, rc) { \
+		x0 = x0 + x1; \
+		x1 = ROTL64(x1, rc) ^ x0; \
+	}
+
+#define TFBIG_MIX8_UI2(w0, w1, w2, w3, w4, w5, w6, w7, rc0, rc1, rc2, rc3) { \
+		TFBIG_MIX_UI2(w0, w1, rc0); \
+		TFBIG_MIX_UI2(w2, w3, rc1); \
+		TFBIG_MIX_UI2(w4, w5, rc2); \
+		TFBIG_MIX_UI2(w6, w7, rc3); \
+	}
+
+#define TFBIG_MIX8_PRE(w0, w1, w2, w3, w4, w5, w6, w7, rc0, rc1, rc2, rc3) { \
+		TFBIG_MIX_PRE(w0, w1, rc0); \
+		TFBIG_MIX_PRE(w2, w3, rc1); \
+		TFBIG_MIX_PRE(w4, w5, rc2); \
+		TFBIG_MIX_PRE(w6, w7, rc3); \
+	}
+
+#define TFBIG_4e_UI2(s)  { \
+		TFBIG_ADDKEY_UI2(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], h, t, s); \
+		TFBIG_MIX8_UI2(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], 46, 36, 19, 37); \
+		TFBIG_MIX8_UI2(p[2], p[1], p[4], p[7], p[6], p[5], p[0], p[3], 33, 27, 14, 42); \
+		TFBIG_MIX8_UI2(p[4], p[1], p[6], p[3], p[0], p[5], p[2], p[7], 17, 49, 36, 39); \
+		TFBIG_MIX8_UI2(p[6], p[1], p[0], p[7], p[2], p[5], p[4], p[3], 44,  9, 54, 56); \
+	}
+
+#define TFBIG_4e_PRE(s)  { \
+		TFBIG_ADDKEY_PRE(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], h, t, s); \
+		TFBIG_MIX8_PRE(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], 46, 36, 19, 37); \
+		TFBIG_MIX8_PRE(p[2], p[1], p[4], p[7], p[6], p[5], p[0], p[3], 33, 27, 14, 42); \
+		TFBIG_MIX8_PRE(p[4], p[1], p[6], p[3], p[0], p[5], p[2], p[7], 17, 49, 36, 39); \
+		TFBIG_MIX8_PRE(p[6], p[1], p[0], p[7], p[2], p[5], p[4], p[3], 44,  9, 54, 56); \
+	}
+
+#define TFBIG_4o_UI2(s)  { \
+		TFBIG_ADDKEY_UI2(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], h, t, s); \
+		TFBIG_MIX8_UI2(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], 39, 30, 34, 24); \
+		TFBIG_MIX8_UI2(p[2], p[1], p[4], p[7], p[6], p[5], p[0], p[3], 13, 50, 10, 17); \
+		TFBIG_MIX8_UI2(p[4], p[1], p[6], p[3], p[0], p[5], p[2], p[7], 25, 29, 39, 43); \
+		TFBIG_MIX8_UI2(p[6], p[1], p[0], p[7], p[2], p[5], p[4], p[3],  8, 35, 56, 22); \
+	}
+
+#define TFBIG_4o_PRE(s)  { \
+		TFBIG_ADDKEY_PRE(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], h, t, s); \
+		TFBIG_MIX8_PRE(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], 39, 30, 34, 24); \
+		TFBIG_MIX8_PRE(p[2], p[1], p[4], p[7], p[6], p[5], p[0], p[3], 13, 50, 10, 17); \
+		TFBIG_MIX8_PRE(p[4], p[1], p[6], p[3], p[0], p[5], p[2], p[7], 25, 29, 39, 43); \
+		TFBIG_MIX8_PRE(p[6], p[1], p[0], p[7], p[2], p[5], p[4], p[3],  8, 35, 56, 22); \
+	}
+
+#define TFBIGMIX8e(){\
+		p[ 0]+=p[ 1];p[ 2]+=p[ 3];p[ 4]+=p[ 5];p[ 6]+=p[ 7];p[ 1]=ROL2(p[ 1],46) ^ p[ 0];p[ 3]=ROL2(p[ 3],36) ^ p[ 2];p[ 5]=ROL2(p[ 5],19) ^ p[ 4];p[ 7]=ROL2(p[ 7],37) ^ p[ 6];\
+		p[ 2]+=p[ 1];p[ 4]+=p[ 7];p[ 6]+=p[ 5];p[ 0]+=p[ 3];p[ 1]=ROL2(p[ 1],33) ^ p[ 2];p[ 7]=ROL2(p[ 7],27) ^ p[ 4];p[ 5]=ROL2(p[ 5],14) ^ p[ 6];p[ 3]=ROL2(p[ 3],42) ^ p[ 0];\
+		p[ 4]+=p[ 1];p[ 6]+=p[ 3];p[ 0]+=p[ 5];p[ 2]+=p[ 7];p[ 1]=ROL2(p[ 1],17) ^ p[ 4];p[ 3]=ROL2(p[ 3],49) ^ p[ 6];p[ 5]=ROL2(p[ 5],36) ^ p[ 0];p[ 7]=ROL2(p[ 7],39) ^ p[ 2];\
+		p[ 6]+=p[ 1];p[ 0]+=p[ 7];p[ 2]+=p[ 5];p[ 4]+=p[ 3];p[ 1]=ROL2(p[ 1],44) ^ p[ 6];p[ 7]=ROL2(p[ 7], 9) ^ p[ 0];p[ 5]=ROL2(p[ 5],54) ^ p[ 2];p[ 3]=ROR8(p[ 3])    ^ p[ 4];\
+}
+#define TFBIGMIX8o(){\
+		p[ 0]+=p[ 1];p[ 2]+=p[ 3];p[ 4]+=p[ 5];p[ 6]+=p[ 7];p[ 1]=ROL2(p[ 1],39) ^ p[ 0];p[ 3]=ROL2(p[ 3],30) ^ p[ 2];p[ 5]=ROL2(p[ 5],34) ^ p[ 4];p[ 7]=ROL24(p[ 7])   ^ p[ 6];\
+		p[ 2]+=p[ 1];p[ 4]+=p[ 7];p[ 6]+=p[ 5];p[ 0]+=p[ 3];p[ 1]=ROL2(p[ 1],13) ^ p[ 2];p[ 7]=ROL2(p[ 7],50) ^ p[ 4];p[ 5]=ROL2(p[ 5],10) ^ p[ 6];p[ 3]=ROL2(p[ 3],17) ^ p[ 0];\
+		p[ 4]+=p[ 1];p[ 6]+=p[ 3];p[ 0]+=p[ 5];p[ 2]+=p[ 7];p[ 1]=ROL2(p[ 1],25) ^ p[ 4];p[ 3]=ROL2(p[ 3],29) ^ p[ 6];p[ 5]=ROL2(p[ 5],39) ^ p[ 0];p[ 7]=ROL2(p[ 7],43) ^ p[ 2];\
+		p[ 6]+=p[ 1];p[ 0]+=p[ 7];p[ 2]+=p[ 5];p[ 4]+=p[ 3];p[ 1]=ROL8(p[ 1])    ^ p[ 6];p[ 7]=ROL2(p[ 7],35) ^ p[ 0];p[ 5]=ROR8(p[ 5])    ^ p[ 2];p[ 3]=ROL2(p[ 3],22) ^ p[ 4];\
+}
+
+#define addwBuff(x0,x1,x2,x3,x4){\
+	p[ 0]+=h[x0];\
+	p[ 1]+=h[x1];\
+	p[ 2]+=h[x2];\
+	p[ 3]+=h[x3];\
+	p[ 4]+=h[x4];\
+	p[ 5]+=c_buffer[i++];\
+	p[ 7]+=c_buffer[i++];\
+	p[ 6]+=c_buffer[i];\
+}
+
+#define addwCon(x0,x1,x2,x3,x4,x5,x6,x7,y0,y1,y2){\
+	p[ 0]+= h[x0];\
+	p[ 1]+= h[x1];\
+	p[ 2]+= h[x2];\
+	p[ 3]+= h[x3];\
+	p[ 4]+= h[x4];\
+	p[ 5]+= h[x5] + c_t[y0];\
+	p[ 6]+= h[x6] + c_t[y1];\
+	p[ 7]+= h[x7] + c_add[y2];\
+}
+
+
diff --git a/streebog/sib.cu b/streebog/sib.cu
index 24de940b7f..31176d1523 100644
--- a/streebog/sib.cu
+++ b/streebog/sib.cu
@@ -1,15 +1,9 @@
 extern "C" {
-#include "sph/sph_blake.h"
-#include "sph/sph_bmw.h"
-#include "sph/sph_groestl.h"
 #include "sph/sph_skein.h"
-#include "sph/sph_jh.h"
-#include "sph/sph_keccak.h"
 #include "sph/sph_luffa.h"
-#include "sph/sph_cubehash.h"
-#include "sph/sph_shavite.h"
-#include "sph/sph_simd.h"
 #include "sph/sph_echo.h"
+#include "sph/sph_fugue.h"
+#include "sph/sph_shabal.h"
 #include "sph/sph_streebog.h"
 }
 
@@ -17,94 +11,76 @@ extern "C" {
 #include "cuda_helper.h"
 #include "x11/cuda_x11.h"
 
-extern void streebog_cpu_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash);
-extern void keccak_streebog_cpu_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash);
-extern void keccak_streebog_luffa_cpu_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash);
-
 #include <stdio.h>
 #include <memory.h>
 
 #define NBN 2
+
 static uint32_t *d_hash[MAX_GPUS];
 static uint32_t *d_resNonce[MAX_GPUS];
 static uint32_t *h_resNonce[MAX_GPUS];
 
+extern void streebog_cpu_hash_64_final(int thr_id, uint32_t threads, uint32_t *d_hash,uint32_t* d_resNonce);
+extern void streebog_set_target(const uint32_t* ptarget);
+
+extern void x14_shabal512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash);
+//extern void x13_fugue512_cpu_hash_64(int thr_id, uint32_t threads,uint32_t *d_hash);
+extern void x13_fugue512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash);
+extern void x13_fugue512_cpu_init(int thr_id, uint32_t threads);
+
+extern void x11_echo512_cpu_hash_64(int thr_id, uint32_t threads,uint32_t *d_hash);
+extern void x11_luffa512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash);
+
+extern void skein512_cpu_setBlock_80(void *pdata);
+extern void skein512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash);
+
-// Sibcoin CPU Hash
+// Polytimos CPU Hash
-extern "C" void sibhash(void *output, const void *input)
+extern "C" void polyhash(void *output, const void *input)
 {
 	unsigned char _ALIGN(128) hash[128] = { 0 };
 
-	sph_blake512_context ctx_blake;
-	sph_bmw512_context ctx_bmw;
-	sph_groestl512_context ctx_groestl;
 	sph_skein512_context ctx_skein;
-	sph_jh512_context ctx_jh;
-	sph_keccak512_context ctx_keccak;
-	sph_gost512_context ctx_gost;
-	sph_luffa512_context ctx_luffa;
-	sph_cubehash512_context ctx_cubehash;
-	sph_shavite512_context ctx_shavite;
-	sph_simd512_context ctx_simd;
+	sph_shabal512_context    ctx_shabal;	
 	sph_echo512_context ctx_echo;
+	sph_luffa512_context ctx_luffa;
+	sph_fugue512_context ctx_fugue;
+	sph_gost512_context ctx_gost;
 
-	sph_blake512_init(&ctx_blake);
-	sph_blake512 (&ctx_blake, input, 80);
-	sph_blake512_close(&ctx_blake, (void*) hash);
 
-	sph_bmw512_init(&ctx_bmw);
-	sph_bmw512 (&ctx_bmw, (const void*) hash, 64);
-	sph_bmw512_close(&ctx_bmw, (void*) hash);
+        sph_skein512_init(&ctx_skein);
+        sph_skein512 (&ctx_skein, input, 80);
+        sph_skein512_close(&ctx_skein, (void*) hash);
 
-	sph_groestl512_init(&ctx_groestl);
-	sph_groestl512 (&ctx_groestl, (const void*) hash, 64);
-	sph_groestl512_close(&ctx_groestl, (void*) hash);
+        sph_shabal512_init(&ctx_shabal);
+        sph_shabal512 (&ctx_shabal, (const void*) hash, 64);
+        sph_shabal512_close(&ctx_shabal, (void*) hash);
 
-	sph_skein512_init(&ctx_skein);
-	sph_skein512 (&ctx_skein, (const void*) hash, 64);
-	sph_skein512_close(&ctx_skein, (void*) hash);
+        sph_echo512_init(&ctx_echo);
+        sph_echo512 (&ctx_echo, (const void*) hash, 64);
+        sph_echo512_close(&ctx_echo, (void*) hash);
 
-	sph_jh512_init(&ctx_jh);
-	sph_jh512 (&ctx_jh, (const void*) hash, 64);
-	sph_jh512_close(&ctx_jh, (void*) hash);
+        sph_luffa512_init(&ctx_luffa);
+        sph_luffa512 (&ctx_luffa, (const void*) hash, 64);
+        sph_luffa512_close (&ctx_luffa, (void*) hash);
 
-	sph_keccak512_init(&ctx_keccak);
-	sph_keccak512 (&ctx_keccak, (const void*) hash, 64);
-	sph_keccak512_close(&ctx_keccak, (void*) hash);
+        sph_fugue512_init(&ctx_fugue);
+        sph_fugue512(&ctx_fugue, (const void*) hash, 64);
+        sph_fugue512_close(&ctx_fugue, (void*) hash);
 
 	sph_gost512_init(&ctx_gost);
 	sph_gost512(&ctx_gost, (const void*) hash, 64);
 	sph_gost512_close(&ctx_gost, (void*) hash);
 
-	sph_luffa512_init(&ctx_luffa);
-	sph_luffa512 (&ctx_luffa, (const void*) hash, 64);
-	sph_luffa512_close (&ctx_luffa, (void*) hash);
-
-	sph_cubehash512_init(&ctx_cubehash);
-	sph_cubehash512 (&ctx_cubehash, (const void*) hash, 64);
-	sph_cubehash512_close(&ctx_cubehash, (void*) hash);
-
-	sph_shavite512_init(&ctx_shavite);
-	sph_shavite512 (&ctx_shavite, (const void*) hash, 64);
-	sph_shavite512_close(&ctx_shavite, (void*) hash);
-
-	sph_simd512_init(&ctx_simd);
-	sph_simd512 (&ctx_simd, (const void*) hash, 64);
-	sph_simd512_close(&ctx_simd, (void*) hash);
-
-	sph_echo512_init(&ctx_echo);
-	sph_echo512 (&ctx_echo, (const void*) hash, 64);
-	sph_echo512_close(&ctx_echo, (void*) hash);
-
 	memcpy(output, hash, 32);
 }
 
 //#define _DEBUG
-#define _DEBUG_PREFIX "sib"
+#define _DEBUG_PREFIX "poly"
 #include "cuda_debug.cuh"
 
 static bool init[MAX_GPUS] = { 0 };
 
-extern "C" int scanhash_sib(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done)
+extern "C" int scanhash_poly(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done)
 {
 
 	int dev_id = device_map[thr_id];
@@ -122,10 +98,8 @@ extern "C" int scanhash_sib(int thr_id, struct work* work, uint32_t max_nonce, u
 	uint32_t throughput = cuda_default_throughput(thr_id, default_throughput); // 19=256*256*8;
 	if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce);
 
-	throughput&=0xFFFFFF70; //multiples of 128 due to keccak_streebog_luffa
-	
-	if (opt_benchmark)
-		ptarget[7] = 0xf;
+        if (opt_benchmark)
+                ptarget[7] = 0xf;
 
 	if (!init[thr_id])
 	{
@@ -139,8 +113,6 @@ extern "C" int scanhash_sib(int thr_id, struct work* work, uint32_t max_nonce, u
 
 		gpulog(LOG_INFO,thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput);
 
-		x11_simd_echo_512_cpu_init(thr_id, throughput);
-
 		CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], 8 * sizeof(uint64_t) * throughput));
 		CUDA_SAFE_CALL(cudaMalloc(&d_resNonce[thr_id], NBN * sizeof(uint32_t)));
 		h_resNonce[thr_id] = (uint32_t*) malloc(NBN * sizeof(uint32_t));
@@ -148,34 +120,28 @@ extern "C" int scanhash_sib(int thr_id, struct work* work, uint32_t max_nonce, u
 			gpulog(LOG_ERR,thr_id,"Host memory allocation failed");
 			exit(EXIT_FAILURE);
 		}
-		
+		x13_fugue512_cpu_init(thr_id, throughput);	
 		init[thr_id] = true;
 	}
-
+	
 	uint32_t endiandata[20];
 	for (int k=0; k < 20; k++)
 		be32enc(&endiandata[k], pdata[k]);
 
-	quark_blake512_cpu_setBlock_80(thr_id, endiandata);
-
+        skein512_cpu_setBlock_80(endiandata);
 	cudaMemset(d_resNonce[thr_id], 0xff, NBN*sizeof(uint32_t));
+       	streebog_set_target(ptarget);
 
 	do {
 		// Hash with CUDA
-		quark_blake512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]);
-		quark_bmw512_cpu_hash_64(thr_id, throughput, NULL, d_hash[thr_id]);
-		quark_groestl512_cpu_hash_64(thr_id, throughput, NULL, d_hash[thr_id]);
-		quark_skein512_cpu_hash_64(thr_id, throughput, NULL, d_hash[thr_id]);
-		quark_jh512_cpu_hash_64(thr_id, throughput, NULL, d_hash[thr_id]);
-		
-//		quark_keccak512_cpu_hash_64(thr_id, throughput, NULL, d_hash[thr_id]);
-//		streebog_cpu_hash_64(thr_id, throughput, d_hash[thr_id]);		
-		keccak_streebog_cpu_hash_64(thr_id, throughput, d_hash[thr_id]);
+
+		skein512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]);
+		x14_shabal512_cpu_hash_64(thr_id, throughput, d_hash[thr_id]);
+		x11_echo512_cpu_hash_64(thr_id, throughput, d_hash[thr_id]);
 
 		x11_luffa512_cpu_hash_64(thr_id, throughput, d_hash[thr_id]);
-		x11_cubehash_shavite512_cpu_hash_64(thr_id, throughput, d_hash[thr_id]);
-//		x11_luffaCubehashShavite512_cpu_hash_64(thr_id, throughput, d_hash[thr_id]);
-		x11_simd_echo512_cpu_hash_64_final(thr_id, throughput, d_hash[thr_id], d_resNonce[thr_id], *(uint64_t*)&ptarget[6]);
+		x13_fugue512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id]);
+		streebog_cpu_hash_64_final(thr_id, throughput, d_hash[thr_id], d_resNonce[thr_id]);
 
 		cudaMemcpy(h_resNonce[thr_id], d_resNonce[thr_id], NBN*sizeof(uint32_t), cudaMemcpyDeviceToHost);
 
@@ -184,9 +150,9 @@ extern "C" int scanhash_sib(int thr_id, struct work* work, uint32_t max_nonce, u
 			const uint32_t startNounce = pdata[19];
 			uint32_t vhash64[8];
 			be32enc(&endiandata[19], startNounce + h_resNonce[thr_id][0]);
-			sibhash(vhash64, endiandata);
+			polyhash(vhash64, endiandata);
 
-			if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) {
+			if (vhash64[7] <= Htarg) {
 				int res = 1;
 				*hashes_done = pdata[19] - first_nonce + throughput;
 				work_set_target_ratio(work, vhash64);
@@ -196,7 +162,7 @@ extern "C" int scanhash_sib(int thr_id, struct work* work, uint32_t max_nonce, u
 //					if(!opt_quiet)
 //						gpulog(LOG_BLUE,dev_id,"Found 2nd nonce: %08x", pdata[21]);
 					be32enc(&endiandata[19], startNounce+h_resNonce[thr_id][1]);
-					sibhash(vhash64, endiandata);
+					polyhash(vhash64, endiandata);
 					if (bn_hash_target_ratio(vhash64, ptarget) > work->shareratio[0]){
 						work_set_target_ratio(work, vhash64);
 						xchg(pdata[19],pdata[21]);
@@ -221,7 +187,7 @@ extern "C" int scanhash_sib(int thr_id, struct work* work, uint32_t max_nonce, u
 }
 
 // cleanup
-extern "C" void free_sib(int thr_id)
+extern "C" void free_poly(int thr_id)
 {
 	if (!init[thr_id])
 		return;
@@ -231,8 +197,6 @@ extern "C" void free_sib(int thr_id)
 	free(h_resNonce[thr_id]);
 	cudaFree(d_resNonce[thr_id]);
 	cudaFree(d_hash[thr_id]);
-	
-	x11_simd_echo_512_cpu_free(thr_id);
 
 	init[thr_id] = false;
 
diff --git a/streebog/veltor.cu b/streebog/veltor.cu
index c288b4f3d7..ea1bd1c76d 100644
--- a/streebog/veltor.cu
+++ b/streebog/veltor.cu
@@ -95,14 +95,14 @@ extern "C" int scanhash_veltor(int thr_id, struct work* work, uint32_t max_nonce
 		init[thr_id] = true;
 	}
 
-	uint32_t endiandata[20];
-	for (int k=0; k < 20; k++)
-		be32enc(&endiandata[k], pdata[k]);
+        uint32_t endiandata[20];
+        for (int k=0; k < 20; k++)
+                be32enc(&endiandata[k], pdata[k]);
 
-	skein512_cpu_setBlock_80(endiandata);
+        skein512_cpu_setBlock_80(endiandata);
 
-	cudaMemset(d_resNonce[thr_id], 0xff, NBN*sizeof(uint32_t));
-	streebog_set_target(ptarget);
+        cudaMemset(d_resNonce[thr_id], 0xff, NBN*sizeof(uint32_t));
+        streebog_set_target(ptarget);
 	
 	do {
 		skein512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]);
diff --git a/util.cpp b/util.cpp
index 146e89d9a1..5ef1ff358a 100644
--- a/util.cpp
+++ b/util.cpp
@@ -2182,7 +2182,7 @@ void print_hash_tests(void){
 	c11hash(&hash[0], &buf[0]);
 	printpfx("c11", hash);
 
-	sibhash(&hash[0], &buf[0]);
-	printpfx("sib", hash);
+	polyhash(&hash[0], &buf[0]);
+	printpfx("poly", hash); // label follows the sib -> poly rename so the output matches the hash printed
 	
 	x13hash(&hash[0], &buf[0]);
diff --git a/x13/cuda_x13_fugue512.cu b/x13/cuda_x13_fugue512.cu
index b3d708a226..43bb744a15 100644
--- a/x13/cuda_x13_fugue512.cu
+++ b/x13/cuda_x13_fugue512.cu
@@ -1,32 +1,26 @@
+
+#include <cuda_helper.h>
+
+#define TPB 256
+
 /*
- * Quick and dirty addition of Fugue-512 for X13
- * 
- * Built on cbuchner1's implementation, actual hashing code
- * heavily based on phm's sgminer
- *
- * 
- */
-#include "cuda_helper.h"
-#include "miner.h"
-#include "cuda_vectors.h"
-/*
- * X13 kernel implementation.
+ * fugue512 x13 kernel implementation.
  *
  * ==========================(LICENSE BEGIN)============================
  *
- * Copyright (c) 2014-2016  phm, Provos Alexis
- * 
+ * Copyright (c) 2014-2017 phm, tpruvot
+ *
  * Permission is hereby granted, free of charge, to any person obtaining
  * a copy of this software and associated documentation files (the
- * "Software", to deal in the Software without restriction, including
+ * "Software"), to deal in the Software without restriction, including
  * without limitation the rights to use, copy, modify, merge, publish,
  * distribute, sublicense, and/or sell copies of the Software, and to
  * permit persons to whom the Software is furnished to do so, subject to
  * the following conditions:
- * 
+ *
  * The above copyright notice and this permission notice shall be
  * included in all copies or substantial portions of the Software.
- * 
+ *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
@@ -36,373 +30,356 @@
  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  *
  * ===========================(LICENSE END)=============================
- *
- * @author   phm <phm@inbox.com>
- * @author   Provos Alexis (Applied partial shared Mem utilization under CUDA 7.5 for compute5.0/5.2 / 2016)
  */
 
-static __constant__ const uint32_t c_S[16] = {
-		0x8807a57e, 0xe616af75, 0xc5d3e4db, 0xac9ab027,
-		0xd915f117, 0xb6eecc54, 0x06e8020b, 0x4a92efd1,
-		0xaac6e2c9, 0xddb21398, 0xcae65838, 0x437f203f,
-		0x25ea78e7, 0x951fddd6, 0xda6ed11d, 0xe13e3567
+#ifdef __INTELLISENSE__
+#define __byte_perm(x, y, m) (x|y)
+#define tex1Dfetch(t, n) (n)
+#define __CUDACC__
+#include <cuda_texture_types.h>
+#endif
+
+// device addresses of the buffers bound to textures, indexed [thr_id][texture id]
+static unsigned int* d_textures[MAX_GPUS][1];
+
+#define mixtab0(x) mixtabs[(x)]
+#define mixtab1(x) mixtabs[(x)+256]
+#define mixtab2(x) mixtabs[(x)+512]
+#define mixtab3(x) mixtabs[(x)+768]
+
+static texture<unsigned int, 1, cudaReadModeElementType> mixTab0Tex;
+
+static const uint32_t mixtab0[] = {
+	0x63633297, 0x7c7c6feb, 0x77775ec7, 0x7b7b7af7, 0xf2f2e8e5, 0x6b6b0ab7, 0x6f6f16a7, 0xc5c56d39,
+	0x303090c0, 0x01010704, 0x67672e87, 0x2b2bd1ac, 0xfefeccd5, 0xd7d71371, 0xabab7c9a, 0x767659c3,
+	0xcaca4005, 0x8282a33e, 0xc9c94909, 0x7d7d68ef, 0xfafad0c5, 0x5959947f, 0x4747ce07, 0xf0f0e6ed,
+	0xadad6e82, 0xd4d41a7d, 0xa2a243be, 0xafaf608a, 0x9c9cf946, 0xa4a451a6, 0x727245d3, 0xc0c0762d,
+	0xb7b728ea, 0xfdfdc5d9, 0x9393d47a, 0x2626f298, 0x363682d8, 0x3f3fbdfc, 0xf7f7f3f1, 0xcccc521d,
+	0x34348cd0, 0xa5a556a2, 0xe5e58db9, 0xf1f1e1e9, 0x71714cdf, 0xd8d83e4d, 0x313197c4, 0x15156b54,
+	0x04041c10, 0xc7c76331, 0x2323e98c, 0xc3c37f21, 0x18184860, 0x9696cf6e, 0x05051b14, 0x9a9aeb5e,
+	0x0707151c, 0x12127e48, 0x8080ad36, 0xe2e298a5, 0xebeba781, 0x2727f59c, 0xb2b233fe, 0x757550cf,
+	0x09093f24, 0x8383a43a, 0x2c2cc4b0, 0x1a1a4668, 0x1b1b416c, 0x6e6e11a3, 0x5a5a9d73, 0xa0a04db6,
+	0x5252a553, 0x3b3ba1ec, 0xd6d61475, 0xb3b334fa, 0x2929dfa4, 0xe3e39fa1, 0x2f2fcdbc, 0x8484b126,
+	0x5353a257, 0xd1d10169, 0x00000000, 0xededb599, 0x2020e080, 0xfcfcc2dd, 0xb1b13af2, 0x5b5b9a77,
+	0x6a6a0db3, 0xcbcb4701, 0xbebe17ce, 0x3939afe4, 0x4a4aed33, 0x4c4cff2b, 0x5858937b, 0xcfcf5b11,
+	0xd0d0066d, 0xefefbb91, 0xaaaa7b9e, 0xfbfbd7c1, 0x4343d217, 0x4d4df82f, 0x333399cc, 0x8585b622,
+	0x4545c00f, 0xf9f9d9c9, 0x02020e08, 0x7f7f66e7, 0x5050ab5b, 0x3c3cb4f0, 0x9f9ff04a, 0xa8a87596,
+	0x5151ac5f, 0xa3a344ba, 0x4040db1b, 0x8f8f800a, 0x9292d37e, 0x9d9dfe42, 0x3838a8e0, 0xf5f5fdf9,
+	0xbcbc19c6, 0xb6b62fee, 0xdada3045, 0x2121e784, 0x10107040, 0xffffcbd1, 0xf3f3efe1, 0xd2d20865,
+	0xcdcd5519, 0x0c0c2430, 0x1313794c, 0xececb29d, 0x5f5f8667, 0x9797c86a, 0x4444c70b, 0x1717655c,
+	0xc4c46a3d, 0xa7a758aa, 0x7e7e61e3, 0x3d3db3f4, 0x6464278b, 0x5d5d886f, 0x19194f64, 0x737342d7,
+	0x60603b9b, 0x8181aa32, 0x4f4ff627, 0xdcdc225d, 0x2222ee88, 0x2a2ad6a8, 0x9090dd76, 0x88889516,
+	0x4646c903, 0xeeeebc95, 0xb8b805d6, 0x14146c50, 0xdede2c55, 0x5e5e8163, 0x0b0b312c, 0xdbdb3741,
+	0xe0e096ad, 0x32329ec8, 0x3a3aa6e8, 0x0a0a3628, 0x4949e43f, 0x06061218, 0x2424fc90, 0x5c5c8f6b,
+	0xc2c27825, 0xd3d30f61, 0xacac6986, 0x62623593, 0x9191da72, 0x9595c662, 0xe4e48abd, 0x797974ff,
+	0xe7e783b1, 0xc8c84e0d, 0x373785dc, 0x6d6d18af, 0x8d8d8e02, 0xd5d51d79, 0x4e4ef123, 0xa9a97292,
+	0x6c6c1fab, 0x5656b943, 0xf4f4fafd, 0xeaeaa085, 0x6565208f, 0x7a7a7df3, 0xaeae678e, 0x08083820,
+	0xbaba0bde, 0x787873fb, 0x2525fb94, 0x2e2ecab8, 0x1c1c5470, 0xa6a65fae, 0xb4b421e6, 0xc6c66435,
+	0xe8e8ae8d, 0xdddd2559, 0x747457cb, 0x1f1f5d7c, 0x4b4bea37, 0xbdbd1ec2, 0x8b8b9c1a, 0x8a8a9b1e,
+	0x70704bdb, 0x3e3ebaf8, 0xb5b526e2, 0x66662983, 0x4848e33b, 0x0303090c, 0xf6f6f4f5, 0x0e0e2a38,
+	0x61613c9f, 0x35358bd4, 0x5757be47, 0xb9b902d2, 0x8686bf2e, 0xc1c17129, 0x1d1d5374, 0x9e9ef74e,
+	0xe1e191a9, 0xf8f8decd, 0x9898e556, 0x11117744, 0x696904bf, 0xd9d93949, 0x8e8e870e, 0x9494c166,
+	0x9b9bec5a, 0x1e1e5a78, 0x8787b82a, 0xe9e9a989, 0xcece5c15, 0x5555b04f, 0x2828d8a0, 0xdfdf2b51,
+	0x8c8c8906, 0xa1a14ab2, 0x89899212, 0x0d0d2334, 0xbfbf10ca, 0xe6e684b5, 0x4242d513, 0x686803bb,
+	0x4141dc1f, 0x9999e252, 0x2d2dc3b4, 0x0f0f2d3c, 0xb0b03df6, 0x5454b74b, 0xbbbb0cda, 0x16166258
 };
 
-static __device__ uint32_t mixtab0[256] = {
-	0x63633297, 0x7c7c6feb, 0x77775ec7, 0x7b7b7af7, 0xf2f2e8e5, 0x6b6b0ab7,	0x6f6f16a7, 0xc5c56d39, 0x303090c0, 0x01010704, 0x67672e87, 0x2b2bd1ac, 0xfefeccd5, 0xd7d71371, 0xabab7c9a,
-	0x767659c3, 0xcaca4005, 0x8282a33e, 0xc9c94909, 0x7d7d68ef, 0xfafad0c5,	0x5959947f, 0x4747ce07, 0xf0f0e6ed, 0xadad6e82, 0xd4d41a7d, 0xa2a243be, 0xafaf608a, 0x9c9cf946, 0xa4a451a6,
-	0x727245d3, 0xc0c0762d, 0xb7b728ea, 0xfdfdc5d9, 0x9393d47a, 0x2626f298, 0x363682d8, 0x3f3fbdfc, 0xf7f7f3f1, 0xcccc521d, 0x34348cd0, 0xa5a556a2, 0xe5e58db9, 0xf1f1e1e9, 0x71714cdf,
-	0xd8d83e4d, 0x313197c4, 0x15156b54, 0x04041c10, 0xc7c76331, 0x2323e98c, 0xc3c37f21, 0x18184860, 0x9696cf6e, 0x05051b14, 0x9a9aeb5e, 0x0707151c, 0x12127e48, 0x8080ad36, 0xe2e298a5,
-	0xebeba781, 0x2727f59c, 0xb2b233fe, 0x757550cf, 0x09093f24, 0x8383a43a, 0x2c2cc4b0, 0x1a1a4668, 0x1b1b416c, 0x6e6e11a3, 0x5a5a9d73, 0xa0a04db6, 0x5252a553, 0x3b3ba1ec, 0xd6d61475,
-	0xb3b334fa, 0x2929dfa4, 0xe3e39fa1, 0x2f2fcdbc, 0x8484b126, 0x5353a257, 0xd1d10169, 0x00000000, 0xededb599, 0x2020e080, 0xfcfcc2dd, 0xb1b13af2, 0x5b5b9a77, 0x6a6a0db3, 0xcbcb4701,
-	0xbebe17ce, 0x3939afe4, 0x4a4aed33, 0x4c4cff2b, 0x5858937b, 0xcfcf5b11, 0xd0d0066d, 0xefefbb91, 0xaaaa7b9e, 0xfbfbd7c1, 0x4343d217, 0x4d4df82f, 0x333399cc, 0x8585b622, 0x4545c00f,
-	0xf9f9d9c9, 0x02020e08, 0x7f7f66e7, 0x5050ab5b, 0x3c3cb4f0, 0x9f9ff04a, 0xa8a87596, 0x5151ac5f, 0xa3a344ba, 0x4040db1b, 0x8f8f800a, 0x9292d37e, 0x9d9dfe42, 0x3838a8e0, 0xf5f5fdf9,
-	0xbcbc19c6, 0xb6b62fee, 0xdada3045, 0x2121e784, 0x10107040, 0xffffcbd1, 0xf3f3efe1, 0xd2d20865, 0xcdcd5519, 0x0c0c2430, 0x1313794c, 0xececb29d, 0x5f5f8667, 0x9797c86a, 0x4444c70b,
-	0x1717655c, 0xc4c46a3d, 0xa7a758aa, 0x7e7e61e3, 0x3d3db3f4, 0x6464278b, 0x5d5d886f, 0x19194f64, 0x737342d7, 0x60603b9b, 0x8181aa32, 0x4f4ff627, 0xdcdc225d, 0x2222ee88, 0x2a2ad6a8,
-	0x9090dd76, 0x88889516, 0x4646c903, 0xeeeebc95, 0xb8b805d6, 0x14146c50, 0xdede2c55, 0x5e5e8163, 0x0b0b312c, 0xdbdb3741, 0xe0e096ad, 0x32329ec8, 0x3a3aa6e8, 0x0a0a3628, 0x4949e43f,
-	0x06061218, 0x2424fc90, 0x5c5c8f6b, 0xc2c27825, 0xd3d30f61, 0xacac6986, 0x62623593, 0x9191da72, 0x9595c662, 0xe4e48abd, 0x797974ff, 0xe7e783b1, 0xc8c84e0d, 0x373785dc, 0x6d6d18af,
-	0x8d8d8e02, 0xd5d51d79, 0x4e4ef123, 0xa9a97292, 0x6c6c1fab, 0x5656b943, 0xf4f4fafd, 0xeaeaa085, 0x6565208f, 0x7a7a7df3, 0xaeae678e, 0x08083820, 0xbaba0bde, 0x787873fb, 0x2525fb94,
-	0x2e2ecab8, 0x1c1c5470, 0xa6a65fae, 0xb4b421e6, 0xc6c66435, 0xe8e8ae8d, 0xdddd2559, 0x747457cb, 0x1f1f5d7c, 0x4b4bea37, 0xbdbd1ec2, 0x8b8b9c1a, 0x8a8a9b1e, 0x70704bdb, 0x3e3ebaf8,
-	0xb5b526e2, 0x66662983, 0x4848e33b, 0x0303090c, 0xf6f6f4f5, 0x0e0e2a38, 0x61613c9f, 0x35358bd4, 0x5757be47, 0xb9b902d2, 0x8686bf2e, 0xc1c17129, 0x1d1d5374, 0x9e9ef74e, 0xe1e191a9,
-	0xf8f8decd, 0x9898e556, 0x11117744, 0x696904bf, 0xd9d93949, 0x8e8e870e, 0x9494c166, 0x9b9bec5a, 0x1e1e5a78, 0x8787b82a, 0xe9e9a989, 0xcece5c15, 0x5555b04f, 0x2828d8a0, 0xdfdf2b51,
-	0x8c8c8906, 0xa1a14ab2, 0x89899212, 0x0d0d2334, 0xbfbf10ca, 0xe6e684b5, 0x4242d513, 0x686803bb, 0x4141dc1f, 0x9999e252, 0x2d2dc3b4, 0x0f0f2d3c, 0xb0b03df6, 0x5454b74b, 0xbbbb0cda,
-	0x16166258
-};
-
-#define mixtab0(x) shared[0][x]
-#define mixtab1(x) shared[1][x]
-#define mixtab2(x) shared[2][x]
-#define mixtab3(x) shared[3][x]
-
 #define TIX4(q, x00, x01, x04, x07, x08, x22, x24, x27, x30) { \
-		x22 ^= x00; \
-		x00 = (q); \
-		x08 ^= (q); \
-		x01 ^= x24; \
-		x04 ^= x27; \
-		x07 ^= x30; \
-	}
+	x22 ^= x00; \
+	x00 = (q); \
+	x08 ^= x00; \
+	x01 ^= x24; \
+	x04 ^= x27; \
+	x07 ^= x30; \
+}
 
 #define CMIX36(x00, x01, x02, x04, x05, x06, x18, x19, x20) { \
-		x00 ^= x04; \
-		x01 ^= x05; \
-		x02 ^= x06; \
-		x18 ^= x04; \
-		x19 ^= x05; \
-		x20 ^= x06; \
-	}
+	x00 ^= x04; \
+	x01 ^= x05; \
+	x02 ^= x06; \
+	x18 ^= x04; \
+	x19 ^= x05; \
+	x20 ^= x06; \
+}
 
-__device__ __forceinline__
-static void SMIX(const uint32_t shared[4][256], uint32_t &x0,uint32_t &x1,uint32_t &x2,uint32_t &x3){
-	uint32_t c0 = mixtab0(__byte_perm(x0,0,0x4443));
-	uint32_t r1 = mixtab1(__byte_perm(x0,0,0x4442));
-	uint32_t r2 = mixtab2(__byte_perm(x0,0,0x4441));
-	uint32_t r3 = mixtab3(__byte_perm(x0,0,0x4440));
-	c0 = c0 ^ r1 ^ r2 ^ r3;
-	uint32_t r0 = mixtab0(__byte_perm(x1,0,0x4443));
-	uint32_t c1 = r0 ^ mixtab1(__byte_perm(x1,0,0x4442));
-	uint32_t tmp = mixtab2(__byte_perm(x1,0,0x4441));
-	c1 ^= tmp;
-	r2 ^= tmp;
-	tmp = mixtab3(__byte_perm(x1,0,0x4440));
-	c1 ^= tmp;
-	r3 ^= tmp;
-	uint32_t c2 = mixtab0(__byte_perm(x2,0,0x4443));
-	r0 ^= c2;
-	tmp = mixtab1(__byte_perm(x2,0,0x4442));
-	c2 ^= tmp;
-	r1 ^= tmp;
-	tmp = mixtab2(__byte_perm(x2,0,0x4441));
-	c2 ^= tmp;
-	tmp = mixtab3(__byte_perm(x2,0,0x4440));
-	c2 ^= tmp;
-	r3 ^= tmp;
-	uint32_t c3 = mixtab0(__byte_perm(x3,0,0x4443));
-	r0 ^= c3;
-	tmp = mixtab1(__byte_perm(x3,0,0x4442));
-	c3 ^= tmp;
-	r1 ^= tmp;
-	tmp = mixtab2(__byte_perm(x3,0,0x4441));
-	c3 ^= tmp;
-	r2 ^= tmp;
-	tmp = mixtab3(__byte_perm(x3,0,0x4440));
-	c3 ^= tmp;
-	x0 = ((c0 ^ (r0 << 0)) & 0xFF000000) | ((c1 ^ (r1 << 0)) & 0x00FF0000) | ((c2 ^ (r2 << 0)) & 0x0000FF00) | ((c3 ^ (r3 << 0)) & 0x000000FF);
-	x1 = ((c1 ^ (r0 << 8)) & 0xFF000000) | ((c2 ^ (r1 << 8)) & 0x00FF0000) | ((c3 ^ (r2 << 8)) & 0x0000FF00) | ((c0 ^ (r3 >>24)) & 0x000000FF);
-	x2 = ((c2 ^ (r0 <<16)) & 0xFF000000) | ((c3 ^ (r1 <<16)) & 0x00FF0000) | ((c0 ^ (r2 >>16)) & 0x0000FF00) | ((c1 ^ (r3 >>16)) & 0x000000FF);
-	x3 = ((c3 ^ (r0 <<24)) & 0xFF000000) | ((c0 ^ (r1 >> 8)) & 0x00FF0000) | ((c1 ^ (r2 >> 8)) & 0x0000FF00) | ((c2 ^ (r3 >> 8)) & 0x000000FF);
+#define SMIX(x0, x1, x2, x3) { \
+	uint32_t tmp; \
+	uint32_t r0 = 0; \
+	uint32_t r1 = 0; \
+	uint32_t r2 = 0; \
+	uint32_t r3 = 0; \
+	uint32_t c0 = mixtab0(x0 >> 24); \
+	tmp = mixtab1((x0 >> 16) & 0xFF); \
+	c0 ^= tmp; \
+	r1 ^= tmp; \
+	tmp = mixtab2((x0 >>  8) & 0xFF); \
+	c0 ^= tmp; \
+	r2 ^= tmp; \
+	tmp = mixtab3(x0 & 0xFF); \
+	c0 ^= tmp; \
+	r3 ^= tmp; \
+	tmp = mixtab0(x1 >> 24); \
+	uint32_t c1 = tmp; \
+	r0 ^= tmp; \
+	tmp = mixtab1((x1 >> 16) & 0xFF); \
+	c1 ^= tmp; \
+	tmp = mixtab2((x1 >>  8) & 0xFF); \
+	c1 ^= tmp; \
+	r2 ^= tmp; \
+	tmp = mixtab3(x1 & 0xFF); \
+	c1 ^= tmp; \
+	r3 ^= tmp; \
+	tmp = mixtab0(x2 >> 24); \
+	uint32_t c2 = tmp; \
+	r0 ^= tmp; \
+	tmp = mixtab1((x2 >> 16) & 0xFF); \
+	c2 ^= tmp; \
+	r1 ^= tmp; \
+	tmp = mixtab2((x2 >>  8) & 0xFF); \
+	c2 ^= tmp; \
+	tmp = mixtab3(x2 & 0xFF); \
+	c2 ^= tmp; \
+	r3 ^= tmp; \
+	tmp = mixtab0(x3 >> 24); \
+	uint32_t c3 = tmp; \
+	r0 ^= tmp; \
+	tmp = mixtab1((x3 >> 16) & 0xFF); \
+	c3 ^= tmp; \
+	r1 ^= tmp; \
+	tmp = mixtab2((x3 >>  8) & 0xFF); \
+	c3 ^= tmp; \
+	r2 ^= tmp; \
+	tmp = mixtab3(x3 & 0xFF); \
+	c3 ^= tmp; \
+	x0 = ((c0 ^ r0) & 0xFF000000) | ((c1 ^ r1) & 0x00FF0000) \
+		| ((c2 ^ r2) & 0x0000FF00) | ((c3 ^ r3) & 0x000000FF); \
+	x1 = ((c1 ^ (r0 <<  8)) & 0xFF000000) | ((c2 ^ (r1 <<  8)) & 0x00FF0000) \
+		| ((c3 ^ (r2 <<  8)) & 0x0000FF00) | ((c0 ^ (r3 >> 24)) & 0x000000FF); \
+	x2 = ((c2 ^ (r0 << 16)) & 0xFF000000) | ((c3 ^ (r1 << 16)) & 0x00FF0000) \
+		| ((c0 ^ (r2 >> 16)) & 0x0000FF00) | ((c1 ^ (r3 >> 16)) & 0x000000FF); \
+	x3 = ((c3 ^ (r0 << 24)) & 0xFF000000) | ((c0 ^ (r1 >>  8)) & 0x00FF0000) \
+		| ((c1 ^ (r2 >>  8)) & 0x0000FF00) | ((c2 ^ (r3 >>  8)) & 0x000000FF); \
 }
 
-__device__
-static void SMIX_LDG(const uint32_t shared[4][256], uint32_t &x0,uint32_t &x1,uint32_t &x2,uint32_t &x3){
-	uint32_t c0 = __ldg(&mixtab0[__byte_perm(x0,0,0x4443)]);
-	uint32_t r1 = mixtab1(__byte_perm(x0,0,0x4442));
-	uint32_t r2 = mixtab2(__byte_perm(x0,0,0x4441));
-	uint32_t r3 = mixtab3(__byte_perm(x0,0,0x4440));
-	c0 = c0 ^ r1 ^ r2 ^ r3;
-	uint32_t r0 = __ldg(&mixtab0[__byte_perm(x1,0,0x4443)]);
-	uint32_t c1 = r0 ^ mixtab1(__byte_perm(x1,0,0x4442));
-	uint32_t tmp = mixtab2(__byte_perm(x1,0,0x4441));
-	c1 ^= tmp;
-	r2 ^= tmp;
-	tmp = mixtab3(__byte_perm(x1,0,0x4440));
-	c1 ^= tmp;
-	r3 ^= tmp;
-	uint32_t c2 = __ldg(&mixtab0[__byte_perm(x2,0,0x4443)]);
-	r0 ^= c2;
-	tmp = mixtab1(__byte_perm(x2,0,0x4442));
-	c2 ^= tmp;
-	r1 ^= tmp;
-	tmp = mixtab2(__byte_perm(x2,0,0x4441));
-	c2 ^= tmp;
-	tmp = mixtab3(__byte_perm(x2,0,0x4440));
-	c2 ^= tmp;
-	r3 ^= tmp;
-	uint32_t c3 = __ldg(&mixtab0[__byte_perm(x3,0,0x4443)]);
-	r0 ^= c3;
-	tmp = mixtab1(__byte_perm(x3,0,0x4442));
-	c3 ^= tmp;
-	r1 ^= tmp;
-	tmp = mixtab2(__byte_perm(x3,0,0x4441));
-	c3 ^= tmp;
-	r2 ^= tmp;
-	tmp = ROL8(__ldg(&mixtab0[__byte_perm(x3,0,0x4440)]));
-	c3 ^= tmp;
-	x0 = ((c0 ^ (r0 << 0)) & 0xFF000000) | ((c1 ^ (r1 << 0)) & 0x00FF0000) | ((c2 ^ (r2 << 0)) & 0x0000FF00) | ((c3 ^ (r3 << 0)) & 0x000000FF);
-	x1 = ((c1 ^ (r0 << 8)) & 0xFF000000) | ((c2 ^ (r1 << 8)) & 0x00FF0000) | ((c3 ^ (r2 << 8)) & 0x0000FF00) | ((c0 ^ (r3 >>24)) & 0x000000FF);
-	x2 = ((c2 ^ (r0 <<16)) & 0xFF000000) | ((c3 ^ (r1 <<16)) & 0x00FF0000) | ((c0 ^ (r2 >>16)) & 0x0000FF00) | ((c1 ^ (r3 >>16)) & 0x000000FF);
-	x3 = ((c3 ^ (r0 <<24)) & 0xFF000000) | ((c0 ^ (r1 >> 8)) & 0x00FF0000) | ((c1 ^ (r2 >> 8)) & 0x0000FF00) | ((c2 ^ (r3 >> 8)) & 0x000000FF);
+#define SUB_ROR3 { \
+	B33 = S33, B34 = S34, B35 = S35; \
+	S35 = S32; S34 = S31; S33 = S30; S32 = S29; S31 = S28; S30 = S27; S29 = S26; S28 = S25; S27 = S24; \
+	S26 = S23; S25 = S22; S24 = S21; S23 = S20; S22 = S19; S21 = S18; S20 = S17; S19 = S16; S18 = S15; \
+	S17 = S14; S16 = S13; S15 = S12; S14 = S11; S13 = S10; S12 = S09; S11 = S08; S10 = S07; S09 = S06; \
+	S08 = S05; S07 = S04; S06 = S03; S05 = S02; S04 = S01; S03 = S00; S02 = B35; S01 = B34; S00 = B33; \
 }
-#define mROR3 { \
-	B[ 6] = S[33], B[ 7] = S[34], B[ 8] = S[35]; \
-	S[35] = S[32]; S[34] = S[31]; S[33] = S[30]; S[32] = S[29]; S[31] = S[28]; S[30] = S[27]; S[29] = S[26]; S[28] = S[25]; S[27] = S[24]; \
-	S[26] = S[23]; S[25] = S[22]; S[24] = S[21]; S[23] = S[20]; S[22] = S[19]; S[21] = S[18]; S[20] = S[17]; S[19] = S[16]; S[18] = S[15]; \
-	S[17] = S[14]; S[16] = S[13]; S[15] = S[12]; S[14] = S[11]; S[13] = S[10]; S[12] = S[ 9]; S[11] = S[ 8]; S[10] = S[ 7]; S[ 9] = S[ 6]; \
-	S[ 8] = S[ 5]; S[ 7] = S[ 4]; S[ 6] = S[ 3]; S[ 5] = S[ 2]; S[ 4] = S[ 1]; S[ 3] = S[ 0]; S[ 2] = B[ 8]; S[ 1] = B[ 7]; S[ 0] = B[ 6]; \
-	}
 
-#define mROR8 { \
-	B[ 1] = S[28], B[ 2] = S[29], B[ 3] = S[30], B[ 4] = S[31], B[ 5] = S[32], B[ 6] = S[33], B[ 7] = S[34], B[ 8] = S[35]; \
-	S[35] = S[27]; S[34] = S[26]; S[33] = S[25]; S[32] = S[24]; S[31] = S[23]; S[30] = S[22]; S[29] = S[21]; S[28] = S[20]; S[27] = S[19]; \
-	S[26] = S[18]; S[25] = S[17]; S[24] = S[16]; S[23] = S[15]; S[22] = S[14]; S[21] = S[13]; S[20] = S[12]; S[19] = S[11]; S[18] = S[10]; \
-	S[17] = S[ 9]; S[16] = S[ 8]; S[15] = S[ 7]; S[14] = S[ 6]; S[13] = S[ 5]; S[12] = S[ 4]; S[11] = S[ 3]; S[10] = S[ 2]; S[ 9] = S[ 1]; \
-	S[ 8] = S[ 0]; S[ 7] = B[ 8]; S[ 6] = B[ 7]; S[ 5] = B[ 6]; S[ 4] = B[ 5]; S[ 3] = B[ 4]; S[ 2] = B[ 3]; S[ 1] = B[ 2]; S[ 0] = B[ 1]; \
-	}
+#define SUB_ROR8 { \
+	B28 = S28, B29 = S29, B30 = S30, B31 = S31, B32 = S32, B33 = S33, B34 = S34, B35 = S35; \
+	S35 = S27; S34 = S26; S33 = S25; S32 = S24; S31 = S23; S30 = S22; S29 = S21; S28 = S20; S27 = S19; \
+	S26 = S18; S25 = S17; S24 = S16; S23 = S15; S22 = S14; S21 = S13; S20 = S12; S19 = S11; S18 = S10; \
+	S17 = S09; S16 = S08; S15 = S07; S14 = S06; S13 = S05; S12 = S04; S11 = S03; S10 = S02; S09 = S01; \
+	S08 = S00; S07 = B35; S06 = B34; S05 = B33; S04 = B32; S03 = B31; S02 = B30; S01 = B29; S00 = B28; \
+}
 
-#define mROR9 { \
-	B[ 0] = S[27], B[ 1] = S[28], B[ 2] = S[29], B[ 3] = S[30], B[ 4] = S[31], B[ 5] = S[32], B[ 6] = S[33], B[ 7] = S[34], B[ 8] = S[35]; \
-	S[35] = S[26]; S[34] = S[25]; S[33] = S[24]; S[32] = S[23]; S[31] = S[22]; S[30] = S[21]; S[29] = S[20]; S[28] = S[19]; S[27] = S[18]; \
-	S[26] = S[17]; S[25] = S[16]; S[24] = S[15]; S[23] = S[14]; S[22] = S[13]; S[21] = S[12]; S[20] = S[11]; S[19] = S[10]; S[18] = S[ 9]; \
-	S[17] = S[ 8]; S[16] = S[ 7]; S[15] = S[ 6]; S[14] = S[ 5]; S[13] = S[ 4]; S[12] = S[ 3]; S[11] = S[ 2]; S[10] = S[ 1]; S[ 9] = S[ 0]; \
-	S[ 8] = B[ 8]; S[ 7] = B[ 7]; S[ 6] = B[ 6]; S[ 5] = B[ 5]; S[ 4] = B[ 4]; S[ 3] = B[ 3]; S[ 2] = B[ 2]; S[ 1] = B[ 1]; S[ 0] = B[ 0]; \
-	}
+#define SUB_ROR9 { \
+	B27 = S27, B28 = S28, B29 = S29, B30 = S30, B31 = S31, B32 = S32, B33 = S33, B34 = S34, B35 = S35; \
+	S35 = S26; S34 = S25; S33 = S24; S32 = S23; S31 = S22; S30 = S21; S29 = S20; S28 = S19; S27 = S18; \
+	S26 = S17; S25 = S16; S24 = S15; S23 = S14; S22 = S13; S21 = S12; S20 = S11; S19 = S10; S18 = S09; \
+	S17 = S08; S16 = S07; S15 = S06; S14 = S05; S13 = S04; S12 = S03; S11 = S02; S10 = S01; S09 = S00; \
+	S08 = B35; S07 = B34; S06 = B33; S05 = B32; S04 = B31; S03 = B30; S02 = B29; S01 = B28; S00 = B27; \
+}
 
-#define FUGUE512_3(x, y, z) {  \
-        TIX4(x, S[ 0], S[ 1], S[ 4], S[ 7], S[ 8], S[22], S[24], S[27], S[30]); \
-        CMIX36(S[33], S[34], S[35], S[ 1], S[ 2], S[ 3], S[15], S[16], S[17]); \
-        SMIX_LDG(shared, S[33], S[34], S[35], S[ 0]); \
-        CMIX36(S[30], S[31], S[32], S[34], S[35], S[ 0], S[12], S[13], S[14]); \
-        SMIX_LDG(shared, S[30], S[31], S[32], S[33]); \
-        CMIX36(S[27], S[28], S[29], S[31], S[32], S[33], S[ 9], S[10], S[11]); \
-        SMIX(shared, S[27], S[28], S[29], S[30]); \
-        CMIX36(S[24], S[25], S[26], S[28], S[29], S[30], S[ 6], S[ 7], S[ 8]); \
-        SMIX_LDG(shared, S[24], S[25], S[26], S[27]); \
-        \
-        TIX4(y, S[24], S[25], S[28], S[31], S[32], S[10], S[12], S[15], S[18]); \
-        CMIX36(S[21], S[22], S[23], S[25], S[26], S[27], S[ 3], S[ 4], S[ 5]); \
-        SMIX(shared, S[21], S[22], S[23], S[24]); \
-        CMIX36(S[18], S[19], S[20], S[22], S[23], S[24], S[ 0], S[ 1], S[ 2]); \
-        SMIX_LDG(shared, S[18], S[19], S[20], S[21]); \
-        CMIX36(S[15], S[16], S[17], S[19], S[20], S[21], S[33], S[34], S[35]); \
-        SMIX_LDG(shared, S[15], S[16], S[17], S[18]); \
-        CMIX36(S[12], S[13], S[14], S[16], S[17], S[18], S[30], S[31], S[32]); \
-        SMIX(shared, S[12], S[13], S[14], S[15]); \
-        \
-        TIX4(z, S[12], S[13], S[16], S[19], S[20], S[34], S[ 0], S[ 3], S[ 6]); \
-        CMIX36(S[ 9], S[10], S[11], S[13], S[14], S[15], S[27], S[28], S[29]); \
-        SMIX_LDG(shared, S[ 9], S[10], S[11], S[12]); \
-        CMIX36(S[ 6], S[ 7], S[ 8], S[10], S[11], S[12], S[24], S[25], S[26]); \
-        SMIX_LDG(shared, S[ 6], S[ 7], S[ 8], S[ 9]); \
-        CMIX36(S[ 3], S[ 4], S[ 5], S[ 7], S[ 8], S[ 9], S[21], S[22], S[23]); \
-        SMIX_LDG(shared, S[ 3], S[ 4], S[ 5], S[ 6]); \
-        CMIX36(S[ 0], S[ 1], S[ 2], S[ 4], S[ 5], S[ 6], S[18], S[19], S[20]); \
-        SMIX_LDG(shared, S[ 0], S[ 1], S[ 2], S[ 3]); \
-	}
+#define FUGUE512_3(x, y, z) { \
+	TIX4(x, S00, S01, S04, S07, S08, S22, S24, S27, S30); \
+	CMIX36(S33, S34, S35, S01, S02, S03, S15, S16, S17); \
+	SMIX(S33, S34, S35, S00); \
+	CMIX36(S30, S31, S32, S34, S35, S00, S12, S13, S14); \
+	SMIX(S30, S31, S32, S33); \
+	CMIX36(S27, S28, S29, S31, S32, S33, S09, S10, S11); \
+	SMIX(S27, S28, S29, S30); \
+	CMIX36(S24, S25, S26, S28, S29, S30, S06, S07, S08); \
+	SMIX(S24, S25, S26, S27); \
+	\
+	TIX4(y, S24, S25, S28, S31, S32, S10, S12, S15, S18); \
+	CMIX36(S21, S22, S23, S25, S26, S27, S03, S04, S05); \
+	SMIX(S21, S22, S23, S24); \
+	CMIX36(S18, S19, S20, S22, S23, S24, S00, S01, S02); \
+	SMIX(S18, S19, S20, S21); \
+	CMIX36(S15, S16, S17, S19, S20, S21, S33, S34, S35); \
+	SMIX(S15, S16, S17, S18); \
+	CMIX36(S12, S13, S14, S16, S17, S18, S30, S31, S32); \
+	SMIX(S12, S13, S14, S15); \
+	\
+	TIX4(z, S12, S13, S16, S19, S20, S34, S00, S03, S06); \
+	CMIX36(S09, S10, S11, S13, S14, S15, S27, S28, S29); \
+	SMIX(S09, S10, S11, S12); \
+	CMIX36(S06, S07, S08, S10, S11, S12, S24, S25, S26); \
+	SMIX(S06, S07, S08, S09); \
+	CMIX36(S03, S04, S05, S07, S08, S09, S21, S22, S23); \
+	SMIX(S03, S04, S05, S06); \
+	CMIX36(S00, S01, S02, S04, S05, S06, S18, S19, S20); \
+	SMIX(S00, S01, S02, S03); \
+}
+
+
+#define AS_UINT4(addr) *((uint4*)(addr))
 
 /***************************************************/
-// Die Hash-Funktion
-__global__ __launch_bounds__(256,3)
+__global__
+__launch_bounds__(TPB)
 void x13_fugue512_gpu_hash_64(uint32_t threads, uint64_t *g_hash)
 {
-	__shared__ uint32_t shared[4][256];
-
-//	if(threadIdx.x<256){
-		const uint32_t tmp = mixtab0[threadIdx.x];
-		shared[0][threadIdx.x] = tmp;
-		shared[1][threadIdx.x] = ROR8(tmp);
-		shared[2][threadIdx.x] = ROL16(tmp);
-		shared[3][threadIdx.x] = ROL8(tmp);
-//	}
-	__syncthreads();
-	const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
-	if (thread < threads)
-	{
-		uint32_t *hash = (uint32_t*)&g_hash[thread<<3];
-
-		uint32_t S[36];
-		uint32_t B[ 9];
-
-		uint32_t Hash[16];
-		
-		*(uint2x4*)&Hash[0] = __ldg4((uint2x4*)&hash[0]);
-		*(uint2x4*)&Hash[8] = __ldg4((uint2x4*)&hash[8]);
-		__syncthreads();
-		
-		S[ 0] = S[ 1] = S[ 2] = S[ 3] = S[ 4] = S[ 5] = S[ 6] = S[ 7] = S[ 8] = S[ 9] = S[10] = S[11] = S[12] = S[13] = S[14] = S[15] = S[16] = S[17] = S[18] = S[19] = 0;
-		*(uint2x4*)&S[20] = *(uint2x4*)&c_S[ 0];
-		*(uint2x4*)&S[28] = *(uint2x4*)&c_S[ 8];
-
-		FUGUE512_3(Hash[0x0], Hash[0x1], Hash[0x2]);
-		FUGUE512_3(Hash[0x3], Hash[0x4], Hash[0x5]);
-		FUGUE512_3(Hash[0x6], Hash[0x7], Hash[0x8]);
-		FUGUE512_3(Hash[0x9], Hash[0xA], Hash[0xB]);
-		FUGUE512_3(Hash[0xC], Hash[0xD], Hash[0xE]);
-		FUGUE512_3(Hash[0xF], 0U, 512U);
-
-		for (uint32_t i = 0; i < 32; i+=2){
-			mROR3;
-			CMIX36(S[ 0], S[ 1], S[ 2], S[ 4], S[ 5], S[ 6], S[18], S[19], S[20]);
-			SMIX_LDG(shared, S[ 0], S[ 1], S[ 2], S[ 3]);
-			mROR3;
-			CMIX36(S[ 0], S[ 1], S[ 2], S[ 4], S[ 5], S[ 6], S[18], S[19], S[20]);
-			SMIX_LDG(shared, S[ 0], S[ 1], S[ 2], S[ 3]);
-		}
-		#pragma unroll
-		for (uint32_t i = 0; i < 13; i ++) {
-			S[ 4] ^= S[ 0];	S[ 9] ^= S[ 0];	S[18] ^= S[ 0];	S[27] ^= S[ 0];
-			mROR9;
-			SMIX_LDG(shared, S[ 0], S[ 1], S[ 2], S[ 3]);
-			S[ 4] ^= S[ 0];	S[10] ^= S[ 0];	S[18] ^= S[ 0];	S[27] ^= S[ 0];
-			mROR9;
-			SMIX_LDG(shared, S[ 0], S[ 1], S[ 2], S[ 3]);
-			S[ 4] ^= S[ 0];	S[10] ^= S[ 0];	S[19] ^= S[ 0];	S[27] ^= S[ 0];
-			mROR9;
-			SMIX_LDG(shared, S[ 0], S[ 1], S[ 2], S[ 3]);
-			S[ 4] ^= S[ 0];	S[10] ^= S[ 0];	S[19] ^= S[ 0];	S[28] ^= S[ 0];
-			mROR8;
-			SMIX_LDG(shared, S[ 0], S[ 1], S[ 2], S[ 3]);
-		}
-		S[ 4] ^= S[ 0];	S[ 9] ^= S[ 0];	S[18] ^= S[ 0];	S[27] ^= S[ 0];
-
-		S[ 0] = cuda_swab32(S[ 1]);	S[ 1] = cuda_swab32(S[ 2]);	S[ 2] = cuda_swab32(S[ 3]);	S[ 3] = cuda_swab32(S[ 4]);
-		S[ 4] = cuda_swab32(S[ 9]);	S[ 5] = cuda_swab32(S[10]);	S[ 6] = cuda_swab32(S[11]);	S[ 7] = cuda_swab32(S[12]);
-		S[ 8] = cuda_swab32(S[18]);	S[ 9] = cuda_swab32(S[19]);	S[10] = cuda_swab32(S[20]);	S[11] = cuda_swab32(S[21]);
-		S[12] = cuda_swab32(S[27]);	S[13] = cuda_swab32(S[28]);	S[14] = cuda_swab32(S[29]);	S[15] = cuda_swab32(S[30]);
-		
-		*(uint2x4*)&hash[ 0] = *(uint2x4*)&S[ 0];
-		*(uint2x4*)&hash[ 8] = *(uint2x4*)&S[ 8];
+	__shared__ uint32_t mixtabs[1024]; // four 256-entry tables, read through the mixtab0..mixtab3 macros
+
+	// fill the shared tables: each thread fetches entry (threadIdx.x & 0xFF) and stores it plus its ROR8/ROL16/ROL8 variants
+	const uint32_t thr = threadIdx.x & 0xFF;
+	const uint32_t tmp = tex1Dfetch(mixTab0Tex, thr);
+	mixtabs[thr] = tmp;
+	mixtabs[thr+256] = ROR8(tmp);
+	mixtabs[thr+512] = ROL16(tmp);
+	mixtabs[thr+768] = ROL8(tmp);
+#if TPB <= 256
+	if (blockDim.x < 256) { // second pass at offset 0x80 so a 128-thread block also fills entries 128..255
+		const uint32_t thr = (threadIdx.x + 0x80) & 0xFF; // NOTE(review): blocks smaller than 128 threads still leave entries unset
+		const uint32_t tmp = tex1Dfetch(mixTab0Tex, thr);
+		mixtabs[thr] = tmp;
+		mixtabs[thr + 256] = ROR8(tmp);
+		mixtabs[thr + 512] = ROL16(tmp);
+		mixtabs[thr + 768] = ROL8(tmp);
 	}
-}
+#endif
 
-/***************************************************/
-// The final hash function
-__global__ __launch_bounds__(512,2) /* force 56 registers */
-void x13_fugue512_gpu_hash_64_final(uint32_t threads,const uint32_t* __restrict__ g_hash,uint32_t* resNonce, const uint64_t target){
-
-	__shared__ uint32_t shared[4][256];
-
-	if(threadIdx.x<256){
-		const uint32_t tmp = mixtab0[threadIdx.x];
-		shared[0][threadIdx.x] = tmp;
-		shared[1][threadIdx.x] = ROR8(tmp);
-		shared[2][threadIdx.x] = ROL16(tmp);
-		shared[3][threadIdx.x] = ROL8(tmp);
-	}
+	__syncthreads();
 
-	const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
+	uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
 	if (thread < threads)
 	{
-		const uint32_t* __restrict__ hash = &g_hash[thread<<4];
-
-		uint32_t S[36];
-		uint32_t B[ 9];
+		const size_t hashPosition = thread;
+		uint64_t*pHash = &g_hash[hashPosition<<3];
 		uint32_t Hash[16];
-		
-		*(uint2x4*)&Hash[0] = __ldg4((uint2x4*)&hash[0]);
-		*(uint2x4*)&Hash[8] = __ldg4((uint2x4*)&hash[8]);
-		__syncthreads();		
-		S[ 0] = S[ 1] = S[ 2] = S[ 3] = S[ 4] = S[ 5] = S[ 6] = S[ 7] = S[ 8] = S[ 9] = S[10] = S[11] = S[12] = S[13] = S[14] = S[15] = S[16] = S[17] = S[18] = S[19] = 0;
-		*(uint2x4*)&S[20] = *(uint2x4*)&c_S[ 0];
-		*(uint2x4*)&S[28] = *(uint2x4*)&c_S[ 8];
-
-		FUGUE512_3(Hash[0x0], Hash[0x1], Hash[0x2]);
-		FUGUE512_3(Hash[0x3], Hash[0x4], Hash[0x5]);
-		FUGUE512_3(Hash[0x6], Hash[0x7], Hash[0x8]);
-		FUGUE512_3(Hash[0x9], Hash[0xA], Hash[0xB]);
-		FUGUE512_3(Hash[0xC], Hash[0xD], Hash[0xE]);
-		FUGUE512_3(Hash[0xF], 0, 512);
-
-		for (int i = 0; i < 32; i++){
-			mROR3;
-			CMIX36(S[ 0], S[ 1], S[ 2], S[ 4], S[ 5], S[ 6], S[18], S[19], S[20]);
-			SMIX_LDG(shared, S[ 0], S[ 1], S[ 2], S[ 3]);			
-		}
-		#pragma unroll
-		for (int i = 0; i < 12; i++) {
-			S[ 4] ^= S[ 0];	S[ 9] ^= S[ 0];	S[18] ^= S[ 0];	S[27] ^= S[ 0];
-			mROR9;
-			SMIX_LDG(shared, S[ 0], S[ 1], S[ 2], S[ 3]);
-			S[ 4] ^= S[ 0];	S[10] ^= S[ 0];	S[18] ^= S[ 0];	S[27] ^= S[ 0];
-			mROR9;
-			SMIX_LDG(shared, S[ 0], S[ 1], S[ 2], S[ 3]);
-			S[ 4] ^= S[ 0];	S[10] ^= S[ 0];	S[19] ^= S[ 0];	S[27] ^= S[ 0];
-			mROR9;
-			SMIX_LDG(shared, S[ 0], S[ 1], S[ 2], S[ 3]);
-			S[ 4] ^= S[ 0];	S[10] ^= S[ 0];	S[19] ^= S[ 0];	S[28] ^= S[ 0];
-			mROR8;
-			SMIX_LDG(shared, S[ 0], S[ 1], S[ 2], S[ 3]);
+
+		#pragma unroll 4
+		for(int i = 0; i < 4; i++)
+			AS_UINT4(&Hash[i*4]) = AS_UINT4(&pHash[i*2]);
+
+		#pragma unroll 16
+		for(int i = 0; i < 16; i++)
+			Hash[i] = cuda_swab32(Hash[i]);
+
+		uint32_t S00, S01, S02, S03, S04, S05, S06, S07, S08, S09;
+		uint32_t S10, S11, S12, S13, S14, S15, S16, S17, S18, S19;
+		uint32_t S20, S21, S22, S23, S24, S25, S26, S27, S28, S29;
+		uint32_t S30, S31, S32, S33, S34, S35;
+
+		uint32_t B27, B28, B29, B30, B31, B32, B33, B34, B35;
+		//const uint64_t bc = (64ULL << 3); // 512
+		//const uint32_t bclo = (uint32_t)(bc);
+		//const uint32_t bchi = (uint32_t)(bc >> 32);
+
+		S00 = S01 = S02 = S03 = S04 = S05 = S06 = S07 = S08 = S09 = 0;
+		S10 = S11 = S12 = S13 = S14 = S15 = S16 = S17 = S18 = S19 = 0;
+		S20 = 0x8807a57e; S21 = 0xe616af75; S22 = 0xc5d3e4db; S23 = 0xac9ab027;
+		S24 = 0xd915f117; S25 = 0xb6eecc54; S26 = 0x06e8020b; S27 = 0x4a92efd1;
+		S28 = 0xaac6e2c9; S29 = 0xddb21398; S30 = 0xcae65838; S31 = 0x437f203f;
+		S32 = 0x25ea78e7; S33 = 0x951fddd6; S34 = 0xda6ed11d; S35 = 0xe13e3567;
+
+		FUGUE512_3((Hash[0x0]), (Hash[0x1]), (Hash[0x2]));
+		FUGUE512_3((Hash[0x3]), (Hash[0x4]), (Hash[0x5]));
+		FUGUE512_3((Hash[0x6]), (Hash[0x7]), (Hash[0x8]));
+		FUGUE512_3((Hash[0x9]), (Hash[0xA]), (Hash[0xB]));
+		FUGUE512_3((Hash[0xC]), (Hash[0xD]), (Hash[0xE]));
+		FUGUE512_3((Hash[0xF]), 0u /*bchi*/, 512u /*bclo*/);
+
+		#pragma unroll 32
+		for (int i = 0; i < 32; i ++) {
+			SUB_ROR3;
+			CMIX36(S00, S01, S02, S04, S05, S06, S18, S19, S20);
+			SMIX(S00, S01, S02, S03);
 		}
-		S[ 4] ^= S[ 0];	S[ 9] ^= S[ 0];	S[18] ^= S[ 0];	S[27] ^= S[ 0];
-		mROR9;
-		SMIX_LDG(shared, S[ 0], S[ 1], S[ 2], S[ 3]);
-		S[ 4] ^= S[ 0];	S[10] ^= S[ 0];	S[18] ^= S[ 0];	S[27] ^= S[ 0];
-		mROR9;
-		SMIX_LDG(shared, S[ 0], S[ 1], S[ 2], S[ 3]);
-		S[ 4] ^= S[ 0];	S[10] ^= S[ 0];	S[19] ^= S[ 0];	S[27] ^= S[ 0];
-		mROR9;
-		SMIX_LDG(shared, S[ 0], S[ 1], S[ 2], S[ 3]);
-
-		S[ 3] = cuda_swab32(S[3]);	S[ 4] = cuda_swab32(S[4]^S[ 0]);
-		
-		const uint64_t check = *(uint64_t*)&S[ 3];
-		if(check <= target){
-			uint32_t tmp = atomicExch(&resNonce[0], thread);
-			if (tmp != UINT32_MAX)
-				resNonce[1] = tmp;		
+		#pragma unroll 13
+		for (int i = 0; i < 13; i++) {
+			S04 ^= S00;
+			S09 ^= S00;
+			S18 ^= S00;
+			S27 ^= S00;
+			SUB_ROR9;
+			SMIX(S00, S01, S02, S03);
+			S04 ^= S00;
+			S10 ^= S00;
+			S18 ^= S00;
+			S27 ^= S00;
+			SUB_ROR9;
+			SMIX(S00, S01, S02, S03);
+			S04 ^= S00;
+			S10 ^= S00;
+			S19 ^= S00;
+			S27 ^= S00;
+			SUB_ROR9;
+			SMIX(S00, S01, S02, S03);
+			S04 ^= S00;
+			S10 ^= S00;
+			S19 ^= S00;
+			S28 ^= S00;
+			SUB_ROR8;
+			SMIX(S00, S01, S02, S03);
 		}
+		S04 ^= S00;
+		S09 ^= S00;
+		S18 ^= S00;
+		S27 ^= S00;
+
+		Hash[0] = cuda_swab32(S01);
+		Hash[1] = cuda_swab32(S02);
+		Hash[2] = cuda_swab32(S03);
+		Hash[3] = cuda_swab32(S04);
+		Hash[4] = cuda_swab32(S09);
+		Hash[5] = cuda_swab32(S10);
+		Hash[6] = cuda_swab32(S11);
+		Hash[7] = cuda_swab32(S12);
+		Hash[8] = cuda_swab32(S18);
+		Hash[9] = cuda_swab32(S19);
+		Hash[10] = cuda_swab32(S20);
+		Hash[11] = cuda_swab32(S21);
+		Hash[12] = cuda_swab32(S27);
+		Hash[13] = cuda_swab32(S28);
+		Hash[14] = cuda_swab32(S29);
+		Hash[15] = cuda_swab32(S30);
+
+		#pragma unroll 4
+		for(int i = 0; i < 4; i++)
+			AS_UINT4(&pHash[i*2]) = AS_UINT4(&Hash[i*4]);
 	}
 }
 
-__host__
-void x13_fugue512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash){
-
-	const uint32_t threadsperblock = 256;
-
-	// berechne wie viele Thread Blocks wir brauchen
-	dim3 grid((threads + threadsperblock-1)/threadsperblock);
-	dim3 block(threadsperblock);
+#define texDef(id, texname, texmem, texsource, texsize) { /* allocate a device buffer, upload texsource, bind it to texname; reads thr_id from the expansion site */ \
+	unsigned int *texmem; \
+	cudaMalloc(&texmem, texsize); /* NOTE(review): none of the CUDA return codes in this macro are checked */ \
+	d_textures[thr_id][id] = texmem; /* remembered so x13_fugue512_cpu_free can release it */ \
+	cudaMemcpy(texmem, texsource, texsize, cudaMemcpyHostToDevice); \
+	texname.normalized = 0; /* unnormalized coordinates */ \
+	texname.filterMode = cudaFilterModePoint; \
+	texname.addressMode[0] = cudaAddressModeClamp; \
+	{ cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<unsigned int>(); \
+	  cudaBindTexture(NULL, &texname, texmem, &channelDesc, texsize ); \
+	} \
+}
 
-	x13_fugue512_gpu_hash_64<<<grid, block>>>(threads, (uint64_t*)d_hash);
+__host__ // one-time setup: uploads the 256-entry host table mixtab0 and binds it to mixTab0Tex
+void x13_fugue512_cpu_init(int thr_id, uint32_t threads) // threads is not used in this function
+{
+	texDef(0, mixTab0Tex, mixTab0m, mixtab0, sizeof(uint32_t)*256); // device pointer is kept in d_textures[thr_id][0]
 }
 
 __host__
-void x13_fugue512_cpu_hash_64_final(int thr_id, uint32_t threads, uint32_t *d_hash, uint32_t *d_resNonce, const uint64_t target){
+void x13_fugue512_cpu_free(int thr_id) // releases the device buffer stored in d_textures[thr_id][0]
+{
+	cudaFree(d_textures[thr_id][0]); // NOTE(review): mixTab0Tex is not unbound first and the stored pointer is not cleared
+}
 
-	const uint32_t threadsperblock = 512;
+__host__
+// Launches x13_fugue512_gpu_hash_64 on d_hash; thr_id, startNounce and d_nonceVector are unused (was: thr_id, threads, d_hash only)
+void x13_fugue512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash)
+{
+	const uint32_t threadsperblock = TPB; // must stay in sync with the kernel's __launch_bounds__(TPB)
 
-	// berechne wie viele Thread Blocks wir brauchen
 	dim3 grid((threads + threadsperblock-1)/threadsperblock);
 	dim3 block(threadsperblock);
 
-	x13_fugue512_gpu_hash_64_final<<<grid, block>>>(threads, d_hash,d_resNonce,target);
+	x13_fugue512_gpu_hash_64 <<<grid, block>>> (threads, (uint64_t*)d_hash);
 }