diff --git a/x11/cuda_x11_simd512.cu b/x11/cuda_x11_simd512.cu index 1d36eba..8ae1b99 100644 --- a/x11/cuda_x11_simd512.cu +++ b/x11/cuda_x11_simd512.cu @@ -1,6 +1,8 @@ /*************************************************************************************************** * SIMD512 SM3+ CUDA IMPLEMENTATION (require cuda_x11_simd512_func.cuh) + * * sp - may 2018 + * */ #include "miner.h" @@ -8,6 +10,451 @@ #include "cuda_vectors_alexis.h" #include "cuda_x11_aes_alexis.cuh" +//#define INTENSIVE_GMF +//#include "cuda_x11_aes.cuh" +#include "x15/cuda_whirlpool_tables.cuh" + + +//--------START OF WHIRLPOOL DEVICE MACROS--------------------------------------------------------------------------- +__device__ static uint64_t b0[256]; +__device__ static uint64_t b7[256]; + +__constant__ static uint2 precomputed_round_key_64[72]; + +__device__ uint2 InitVector_RC[10]; + +__device__ __forceinline__ +void static TRANSFER(uint2 *const __restrict__ dst, const uint2 *const __restrict__ src){ + dst[0] = src[0]; + dst[1] = src[1]; + dst[2] = src[2]; + dst[3] = src[3]; + dst[4] = src[4]; + dst[5] = src[5]; + dst[6] = src[6]; + dst[7] = src[7]; +} + +__device__ __forceinline__ +static uint2 d_ROUND_ELT_LDG(const uint2 sharedMemory[7][256], const uint2 *const __restrict__ in, const int i0, const int i1, const int i2, const int i3, const int i4, const int i5, const int i6, const int i7){ + uint2 ret = __ldg((uint2*)&b0[__byte_perm(in[i0].x, 0, 0x4440)]); + ret ^= sharedMemory[1][__byte_perm(in[i1].x, 0, 0x4441)]; + ret ^= sharedMemory[2][__byte_perm(in[i2].x, 0, 0x4442)]; + ret ^= sharedMemory[3][__byte_perm(in[i3].x, 0, 0x4443)]; + ret ^= sharedMemory[4][__byte_perm(in[i4].y, 0, 0x4440)]; + ret ^= ROR24(__ldg((uint2*)&b0[__byte_perm(in[i5].y, 0, 0x4441)])); + ret ^= ROR8(__ldg((uint2*)&b7[__byte_perm(in[i6].y, 0, 0x4442)])); + ret ^= __ldg((uint2*)&b7[__byte_perm(in[i7].y, 0, 0x4443)]); + return ret; +} + +__device__ __forceinline__ +static uint2 d_ROUND_ELT(const uint2 sharedMemory[7][256], const uint2 *const __restrict__ in, const int i0, const int i1, const int i2, const int i3, const int i4, const int i5, const int i6, const int i7){ + + uint2 ret = __ldg((uint2*)&b0[__byte_perm(in[i0].x, 0, 0x4440)]); + ret ^= sharedMemory[1][__byte_perm(in[i1].x, 0, 0x4441)]; + ret ^= sharedMemory[2][__byte_perm(in[i2].x, 0, 0x4442)]; + ret ^= sharedMemory[3][__byte_perm(in[i3].x, 0, 0x4443)]; + ret ^= sharedMemory[4][__byte_perm(in[i4].y, 0, 0x4440)]; + ret ^= sharedMemory[5][__byte_perm(in[i5].y, 0, 0x4441)]; + ret ^= ROR8(__ldg((uint2*)&b7[__byte_perm(in[i6].y, 0, 0x4442)])); + ret ^= __ldg((uint2*)&b7[__byte_perm(in[i7].y, 0, 0x4443)]); + return ret; +} + +__device__ __forceinline__ +static uint2 d_ROUND_ELT1_LDG(const uint2 sharedMemory[7][256], const uint2 *const __restrict__ in, const int i0, const int i1, const int i2, const int i3, const int i4, const int i5, const int i6, const int i7, const uint2 c0){ + + uint2 ret = __ldg((uint2*)&b0[__byte_perm(in[i0].x, 0, 0x4440)]); + ret ^= sharedMemory[1][__byte_perm(in[i1].x, 0, 0x4441)]; + ret ^= sharedMemory[2][__byte_perm(in[i2].x, 0, 0x4442)]; + ret ^= sharedMemory[3][__byte_perm(in[i3].x, 0, 0x4443)]; + ret ^= sharedMemory[4][__byte_perm(in[i4].y, 0, 0x4440)]; + ret ^= ROR24(__ldg((uint2*)&b0[__byte_perm(in[i5].y, 0, 0x4441)])); + ret ^= ROR8(__ldg((uint2*)&b7[__byte_perm(in[i6].y, 0, 0x4442)])); + ret ^= __ldg((uint2*)&b7[__byte_perm(in[i7].y, 0, 0x4443)]); + ret ^= c0; + return ret; +} + +__device__ __forceinline__ +static uint2 d_ROUND_ELT1(const uint2 sharedMemory[7][256], const uint2 *const __restrict__ in, const int i0, const int i1, const int i2, const int i3, const int i4, const int i5, const int i6, const int i7, const uint2 c0){ + uint2 ret = __ldg((uint2*)&b0[__byte_perm(in[i0].x, 0, 0x4440)]); + ret ^= sharedMemory[1][__byte_perm(in[i1].x, 0, 0x4441)]; + ret ^= sharedMemory[2][__byte_perm(in[i2].x, 0, 0x4442)]; + ret ^= sharedMemory[3][__byte_perm(in[i3].x, 0, 0x4443)]; + ret ^= sharedMemory[4][__byte_perm(in[i4].y, 0, 0x4440)]; + ret ^= sharedMemory[5][__byte_perm(in[i5].y, 0, 0x4441)]; + ret ^= ROR8(__ldg((uint2*)&b7[__byte_perm(in[i6].y, 0, 0x4442)]));//sharedMemory[6][__byte_perm(in[i6].y, 0, 0x4442)] + ret ^= __ldg((uint2*)&b7[__byte_perm(in[i7].y, 0, 0x4443)]);//sharedMemory[7][__byte_perm(in[i7].y, 0, 0x4443)] + ret ^= c0; + return ret; +} + +//--------END OF WHIRLPOOL DEVICE MACROS----------------------------------------------------------------------------- + +//---hamsi macros--- +__constant__ uint32_t d_alpha_n[] = { + 0xff00f0f0, 0xccccaaaa, 0xf0f0cccc, 0xff00aaaa, 0xccccaaaa, 0xf0f0ff00, 0xaaaacccc, 0xf0f0ff00, 0xf0f0cccc, 0xaaaaff00, 0xccccff00, 0xaaaaf0f0, 0xaaaaf0f0, 0xff00cccc, 0xccccf0f0, 0xff00aaaa, + 0xccccaaaa, 0xff00f0f0, 0xff00aaaa, 0xf0f0cccc, 0xf0f0ff00, 0xccccaaaa, 0xf0f0ff00, 0xaaaacccc, 0xaaaaff00, 0xf0f0cccc, 0xaaaaf0f0, 0xccccff00, 0xff00cccc, 0xaaaaf0f0, 0xff00aaaa, 0xccccf0f0 +}; + +__constant__ uint32_t d_alpha_f[] = { + 0xcaf9639c, 0x0ff0f9c0, 0x639c0ff0, 0xcaf9f9c0, 0x0ff0f9c0, 0x639ccaf9, 0xf9c00ff0, 0x639ccaf9, 0x639c0ff0, 0xf9c0caf9, 0x0ff0caf9, 0xf9c0639c, 0xf9c0639c, 0xcaf90ff0, 0x0ff0639c, 0xcaf9f9c0, + 0x0ff0f9c0, 0xcaf9639c, 0xcaf9f9c0, 0x639c0ff0, 0x639ccaf9, 0x0ff0f9c0, 0x639ccaf9, 0xf9c00ff0, 0xf9c0caf9, 0x639c0ff0, 0xf9c0639c, 0x0ff0caf9, 0xcaf90ff0, 0xf9c0639c, 0xcaf9f9c0, 0x0ff0639c +}; + +__constant__ uint32_t c_c[] = { + 0x73746565, 0x6c706172, 0x6b204172, 0x656e6265, 0x72672031, 0x302c2062, 0x75732032, 0x3434362c, + 0x20422d33, 0x30303120, 0x4c657576, 0x656e2d48, 0x65766572, 0x6c65652c, 0x2042656c, 0x6769756d +}; + +__constant__ uint32_t d_T512[1024] = { + 0xef0b0270, 0x3afd0000, 0x5dae0000, 0x69490000, 0x9b0f3c06, 0x4405b5f9, 0x66140a51, 0x924f5d0a, 0xc96b0030, 0xe7250000, 0x2f840000, 0x264f0000, 0x08695bf9, 0x6dfcf137, 0x509f6984, 0x9e69af68, + 0xc96b0030, 0xe7250000, 0x2f840000, 0x264f0000, 0x08695bf9, 0x6dfcf137, 0x509f6984, 0x9e69af68, 0x26600240, 0xddd80000, 0x722a0000, 0x4f060000, 0x936667ff, 0x29f944ce, 0x368b63d5, 0x0c26f262, + 0x145a3c00, 0xb9e90000, 0x61270000, 0xf1610000, 0xce613d6c, 0xb0493d78, 0x47a96720, 0xe18e24c5, 0x23671400, 0xc8b90000, 0xf4c70000, 0xfb750000, 0x73cd2465, 0xf8a6a549, 0x02c40a3f, 0xdc24e61f, + 0x23671400, 0xc8b90000, 0xf4c70000, 0xfb750000, 0x73cd2465, 0xf8a6a549, 0x02c40a3f, 0xdc24e61f, 0x373d2800, 0x71500000, 0x95e00000, 0x0a140000, 0xbdac1909, 0x48ef9831, 0x456d6d1f, 0x3daac2da, + 0x54285c00, 0xeaed0000, 0xc5d60000, 0xa1c50000, 0xb3a26770, 0x94a5c4e1, 0x6bb0419d, 0x551b3782, 0x9cbb1800, 0xb0d30000, 0x92510000, 0xed930000, 0x593a4345, 0xe114d5f4, 0x430633da, 0x78cace29, + 0x9cbb1800, 0xb0d30000, 0x92510000, 0xed930000, 0x593a4345, 0xe114d5f4, 0x430633da, 0x78cace29, 0xc8934400, 0x5a3e0000, 0x57870000, 0x4c560000, 0xea982435, 0x75b11115, 0x28b67247, 0x2dd1f9ab, + 0x29449c00, 0x64e70000, 0xf24b0000, 0xc2f30000, 0x0ede4e8f, 0x56c23745, 0xf3e04259, 0x8d0d9ec4, 0x466d0c00, 0x08620000, 0xdd5d0000, 0xbadd0000, 0x6a927942, 0x441f2b93, 0x218ace6f, 0xbf2c0be2, + 0x466d0c00, 0x08620000, 0xdd5d0000, 0xbadd0000, 0x6a927942, 0x441f2b93, 0x218ace6f, 0xbf2c0be2, 0x6f299000, 0x6c850000, 0x2f160000, 0x782e0000, 0x644c37cd, 0x12dd1cd6, 0xd26a8c36, 0x32219526, + 0xf6800005, 0x3443c000, 0x24070000, 0x8f3d0000, 0x21373bfb, 0x0ab8d5ae, 0xcdc58b19, 0xd795ba31, 0xa67f0001, 0x71378000, 0x19fc0000, 0x96db0000, 0x3a8b6dfd, 0xebcaaef3, 0x2c6d478f, 0xac8e6c88, + 0xa67f0001, 0x71378000, 0x19fc0000, 0x96db0000, 0x3a8b6dfd, 0xebcaaef3, 0x2c6d478f, 0xac8e6c88, 0x50ff0004, 0x45744000, 0x3dfb0000, 0x19e60000, 0x1bbc5606, 0xe1727b5d, 0xe1a8cc96, 0x7b1bd6b9, + 0xf7750009, 0xcf3cc000, 0xc3d60000, 0x04920000, 0x029519a9, 0xf8e836ba, 0x7a87f14e, 0x9e16981a, 0xd46a0000, 0x8dc8c000, 0xa5af0000, 0x4a290000, 0xfc4e427a, 0xc9b4866c, 0x98369604, 0xf746c320, + 0xd46a0000, 0x8dc8c000, 0xa5af0000, 0x4a290000, 0xfc4e427a, 0xc9b4866c, 0x98369604, 0xf746c320, 0x231f0009, 0x42f40000, 0x66790000, 0x4ebb0000, 0xfedb5bd3, 0x315cb0d6, 0xe2b1674a, 0x69505b3a, + 0x774400f0, 0xf15a0000, 0xf5b20000, 0x34140000, 0x89377e8c, 0x5a8bec25, 0x0bc3cd1e, 0xcf3775cb, 0xf46c0050, 0x96180000, 0x14a50000, 0x031f0000, 0x42947eb8, 0x66bf7e19, 0x9ca470d2, 0x8a341574, + 0xf46c0050, 0x96180000, 0x14a50000, 0x031f0000, 0x42947eb8, 0x66bf7e19, 0x9ca470d2, 0x8a341574, 0x832800a0, 0x67420000, 0xe1170000, 0x370b0000, 0xcba30034, 0x3c34923c, 0x9767bdcc, 0x450360bf, + 0xe8870170, 0x9d720000, 0x12db0000, 0xd4220000, 0xf2886b27, 0xa921e543, 0x4ef8b518, 0x618813b1, 0xb4370060, 0x0c4c0000, 0x56c20000, 0x5cae0000, 0x94541f3f, 0x3b3ef825, 0x1b365f3d, 0xf3d45758, + 0xb4370060, 0x0c4c0000, 0x56c20000, 0x5cae0000, 0x94541f3f, 0x3b3ef825, 0x1b365f3d, 0xf3d45758, 0x5cb00110, 0x913e0000, 0x44190000, 0x888c0000, 0x66dc7418, 0x921f1d66, 0x55ceea25, 0x925c44e9, + 0x0c720000, 0x49e50f00, 0x42790000, 0x5cea0000, 0x33aa301a, 0x15822514, 0x95a34b7b, 0xb44b0090, 0xfe220000, 0xa7580500, 0x25d10000, 0xf7600000, 0x893178da, 0x1fd4f860, 0x4ed0a315, 0xa123ff9f, + 0xfe220000, 0xa7580500, 0x25d10000, 0xf7600000, 0x893178da, 0x1fd4f860, 0x4ed0a315, 0xa123ff9f, 0xf2500000, 0xeebd0a00, 0x67a80000, 0xab8a0000, 0xba9b48c0, 0x0a56dd74, 0xdb73e86e, 0x1568ff0f, + 0x45180000, 0xa5b51700, 0xf96a0000, 0x3b480000, 0x1ecc142c, 0x231395d6, 0x16bca6b0, 0xdf33f4df, 0xb83d0000, 0x16710600, 0x379a0000, 0xf5b10000, 0x228161ac, 0xae48f145, 0x66241616, 0xc5c1eb3e, + 0xb83d0000, 0x16710600, 0x379a0000, 0xf5b10000, 0x228161ac, 0xae48f145, 0x66241616, 0xc5c1eb3e, 0xfd250000, 0xb3c41100, 0xcef00000, 0xcef90000, 0x3c4d7580, 0x8d5b6493, 0x7098b0a6, 0x1af21fe1, + 0x75a40000, 0xc28b2700, 0x94a40000, 0x90f50000, 0xfb7857e0, 0x49ce0bae, 0x1767c483, 0xaedf667e, 0xd1660000, 0x1bbc0300, 0x9eec0000, 0xf6940000, 0x03024527, 0xcf70fcf2, 0xb4431b17, 0x857f3c2b, + 0xd1660000, 0x1bbc0300, 0x9eec0000, 0xf6940000, 0x03024527, 0xcf70fcf2, 0xb4431b17, 0x857f3c2b, 0xa4c20000, 0xd9372400, 0x0a480000, 0x66610000, 0xf87a12c7, 0x86bef75c, 0xa324df94, 0x2ba05a55, + 0x75c90003, 0x0e10c000, 0xd1200000, 0xbaea0000, 0x8bc42f3e, 0x8758b757, 0xbb28761d, 0x00b72e2b, 0xeecf0001, 0x6f564000, 0xf33e0000, 0xa79e0000, 0xbdb57219, 0xb711ebc5, 0x4a3b40ba, 0xfeabf254, + 0xeecf0001, 0x6f564000, 0xf33e0000, 0xa79e0000, 0xbdb57219, 0xb711ebc5, 0x4a3b40ba, 0xfeabf254, 0x9b060002, 0x61468000, 0x221e0000, 0x1d740000, 0x36715d27, 0x30495c92, 0xf11336a7, 0xfe1cdc7f, + 0x86790000, 0x3f390002, 0xe19ae000, 0x98560000, 0x9565670e, 0x4e88c8ea, 0xd3dd4944, 0x161ddab9, 0x30b70000, 0xe5d00000, 0xf4f46000, 0x42c40000, 0x63b83d6a, 0x78ba9460, 0x21afa1ea, 0xb0a51834, + 0x30b70000, 0xe5d00000, 0xf4f46000, 0x42c40000, 0x63b83d6a, 0x78ba9460, 0x21afa1ea, 0xb0a51834, 0xb6ce0000, 0xdae90002, 0x156e8000, 0xda920000, 0xf6dd5a64, 0x36325c8a, 0xf272e8ae, 0xa6b8c28d, + 0x14190000, 0x23ca003c, 0x50df0000, 0x44b60000, 0x1b6c67b0, 0x3cf3ac75, 0x61e610b0, 0xdbcadb80, 0xe3430000, 0x3a4e0014, 0xf2c60000, 0xaa4e0000, 0xdb1e42a6, 0x256bbe15, 0x123db156, 0x3a4e99d7, + 0xe3430000, 0x3a4e0014, 0xf2c60000, 0xaa4e0000, 0xdb1e42a6, 0x256bbe15, 0x123db156, 0x3a4e99d7, 0xf75a0000, 0x19840028, 0xa2190000, 0xeef80000, 0xc0722516, 0x19981260, 0x73dba1e6, 0xe1844257, + 0x54500000, 0x0671005c, 0x25ae0000, 0x6a1e0000, 0x2ea54edf, 0x664e8512, 0xbfba18c3, 0x7e715d17, 0xbc8d0000, 0xfc3b0018, 0x19830000, 0xd10b0000, 0xae1878c4, 0x42a69856, 0x0012da37, 0x2c3b504e, + 0xbc8d0000, 0xfc3b0018, 0x19830000, 0xd10b0000, 0xae1878c4, 0x42a69856, 0x0012da37, 0x2c3b504e, 0xe8dd0000, 0xfa4a0044, 0x3c2d0000, 0xbb150000, 0x80bd361b, 0x24e81d44, 0xbfa8c2f4, 0x524a0d59, + 0x69510000, 0xd4e1009c, 0xc3230000, 0xac2f0000, 0xe4950bae, 0xcea415dc, 0x87ec287c, 0xbce1a3ce, 0xc6730000, 0xaf8d000c, 0xa4c10000, 0x218d0000, 0x23111587, 0x7913512f, 0x1d28ac88, 0x378dd173, + 0xc6730000, 0xaf8d000c, 0xa4c10000, 0x218d0000, 0x23111587, 0x7913512f, 0x1d28ac88, 0x378dd173, 0xaf220000, 0x7b6c0090, 0x67e20000, 0x8da20000, 0xc7841e29, 0xb7b744f3, 0x9ac484f4, 0x8b6c72bd, + 0xcc140000, 0xa5630000, 0x5ab90780, 0x3b500000, 0x4bd013ff, 0x879b3418, 0x694348c1, 0xca5a87fe, 0x819e0000, 0xec570000, 0x66320280, 0x95f30000, 0x5da92802, 0x48f43cbc, 0xe65aa22d, 0x8e67b7fa, + 0x819e0000, 0xec570000, 0x66320280, 0x95f30000, 0x5da92802, 0x48f43cbc, 0xe65aa22d, 0x8e67b7fa, 0x4d8a0000, 0x49340000, 0x3c8b0500, 0xaea30000, 0x16793bfd, 0xcf6f08a4, 0x8f19eaec, 0x443d3004, + 0x78230000, 0x12fc0000, 0xa93a0b80, 0x90a50000, 0x713e2879, 0x7ee98924, 0xf08ca062, 0x636f8bab, 0x02af0000, 0xb7280000, 0xba1c0300, 0x56980000, 0xba8d45d3, 0x8048c667, 0xa95c149a, 0xf4f6ea7b, + 0x02af0000, 0xb7280000, 0xba1c0300, 0x56980000, 0xba8d45d3, 0x8048c667, 0xa95c149a, 0xf4f6ea7b, 0x7a8c0000, 0xa5d40000, 0x13260880, 0xc63d0000, 0xcbb36daa, 0xfea14f43, 0x59d0b4f8, 0x979961d0, + 0xac480000, 0x1ba60000, 0x45fb1380, 0x03430000, 0x5a85316a, 0x1fb250b6, 0xfe72c7fe, 0x91e478f6, 0x1e4e0000, 0xdecf0000, 0x6df80180, 0x77240000, 0xec47079e, 0xf4a0694e, 0xcda31812, 0x98aa496e, + 0x1e4e0000, 0xdecf0000, 0x6df80180, 0x77240000, 0xec47079e, 0xf4a0694e, 0xcda31812, 0x98aa496e, 0xb2060000, 0xc5690000, 0x28031200, 0x74670000, 0xb6c236f4, 0xeb1239f8, 0x33d1dfec, 0x094e3198, + 0xaec30000, 0x9c4f0001, 0x79d1e000, 0x2c150000, 0x45cc75b3, 0x6650b736, 0xab92f78f, 0xa312567b, 0xdb250000, 0x09290000, 0x49aac000, 0x81e10000, 0xcafe6b59, 0x42793431, 0x43566b76, 0xe86cba2e, + 0xdb250000, 0x09290000, 0x49aac000, 0x81e10000, 0xcafe6b59, 0x42793431, 0x43566b76, 0xe86cba2e, 0x75e60000, 0x95660001, 0x307b2000, 0xadf40000, 0x8f321eea, 0x24298307, 0xe8c49cf9, 0x4b7eec55, + 0x58430000, 0x807e0000, 0x78330001, 0xc66b3800, 0xe7375cdc, 0x79ad3fdd, 0xac73fe6f, 0x3a4479b1, 0x1d5a0000, 0x2b720000, 0x488d0000, 0xaf611800, 0x25cb2ec5, 0xc879bfd0, 0x81a20429, 0x1e7536a6, + 0x1d5a0000, 0x2b720000, 0x488d0000, 0xaf611800, 0x25cb2ec5, 0xc879bfd0, 0x81a20429, 0x1e7536a6, 0x45190000, 0xab0c0000, 0x30be0001, 0x690a2000, 0xc2fc7219, 0xb1d4800d, 0x2dd1fa46, 0x24314f17, + 0xa53b0000, 0x14260000, 0x4e30001e, 0x7cae0000, 0x8f9e0dd5, 0x78dfaa3d, 0xf73168d8, 0x0b1b4946, 0x07ed0000, 0xb2500000, 0x8774000a, 0x970d0000, 0x437223ae, 0x48c76ea4, 0xf4786222, 0x9075b1ce, + 0x07ed0000, 0xb2500000, 0x8774000a, 0x970d0000, 0x437223ae, 0x48c76ea4, 0xf4786222, 0x9075b1ce, 0xa2d60000, 0xa6760000, 0xc9440014, 0xeba30000, 0xccec2e7b, 0x3018c499, 0x03490afa, 0x9b6ef888, + 0x88980000, 0x1f940000, 0x7fcf002e, 0xfb4e0000, 0xf158079a, 0x61ae9167, 0xa895706c, 0xe6107494, 0x0bc20000, 0xdb630000, 0x7e88000c, 0x15860000, 0x91fd48f3, 0x7581bb43, 0xf460449e, 0xd8b61463, + 0x0bc20000, 0xdb630000, 0x7e88000c, 0x15860000, 0x91fd48f3, 0x7581bb43, 0xf460449e, 0xd8b61463, 0x835a0000, 0xc4f70000, 0x01470022, 0xeec80000, 0x60a54f69, 0x142f2a24, 0x5cf534f2, 0x3ea660f7, + 0x52500000, 0x29540000, 0x6a61004e, 0xf0ff0000, 0x9a317eec, 0x452341ce, 0xcf568fe5, 0x5303130f, 0x538d0000, 0xa9fc0000, 0x9ef70006, 0x56ff0000, 0x0ae4004e, 0x92c5cdf9, 0xa9444018, 0x7f975691, + 0x538d0000, 0xa9fc0000, 0x9ef70006, 0x56ff0000, 0x0ae4004e, 0x92c5cdf9, 0xa9444018, 0x7f975691, 0x01dd0000, 0x80a80000, 0xf4960048, 0xa6000000, 0x90d57ea2, 0xd7e68c37, 0x6612cffd, 0x2c94459e, + 0xe6280000, 0x4c4b0000, 0xa8550000, 0xd3d002e0, 0xd86130b8, 0x98a7b0da, 0x289506b4, 0xd75a4897, 0xf0c50000, 0x59230000, 0x45820000, 0xe18d00c0, 0x3b6d0631, 0xc2ed5699, 0xcbe0fe1c, 0x56a7b19f, + 0xf0c50000, 0x59230000, 0x45820000, 0xe18d00c0, 0x3b6d0631, 0xc2ed5699, 0xcbe0fe1c, 0x56a7b19f, 0x16ed0000, 0x15680000, 0xedd70000, 0x325d0220, 0xe30c3689, 0x5a4ae643, 0xe375f8a8, 0x81fdf908, + 0xb4310000, 0x77330000, 0xb15d0000, 0x7fd004e0, 0x78a26138, 0xd116c35d, 0xd256d489, 0x4e6f74de, 0xe3060000, 0xbdc10000, 0x87130000, 0xbff20060, 0x2eba0a1a, 0x8db53751, 0x73c5ab06, 0x5bd61539, + 0xe3060000, 0xbdc10000, 0x87130000, 0xbff20060, 0x2eba0a1a, 0x8db53751, 0x73c5ab06, 0x5bd61539, 0x57370000, 0xcaf20000, 0x364e0000, 0xc0220480, 0x56186b22, 0x5ca3f40c, 0xa1937f8f, 0x15b961e7, + 0x02f20000, 0xa2810000, 0x873f0000, 0xe36c7800, 0x1e1d74ef, 0x073d2bd6, 0xc4c23237, 0x7f32259e, 0xbadd0000, 0x13ad0000, 0xb7e70000, 0xf7282800, 0xdf45144d, 0x361ac33a, 0xea5a8d14, 0x2a2c18f0, + 0xbadd0000, 0x13ad0000, 0xb7e70000, 0xf7282800, 0xdf45144d, 0x361ac33a, 0xea5a8d14, 0x2a2c18f0, 0xb82f0000, 0xb12c0000, 0x30d80000, 0x14445000, 0xc15860a2, 0x3127e8ec, 0x2e98bf23, 0x551e3d6e, + 0x1e6c0000, 0xc4420000, 0x8a2e0000, 0xbcb6b800, 0x2c4413b6, 0x8bfdd3da, 0x6a0c1bc8, 0xb99dc2eb, 0x92560000, 0x1eda0000, 0xea510000, 0xe8b13000, 0xa93556a5, 0xebfb6199, 0xb15c2254, 0x33c5244f, + 0x92560000, 0x1eda0000, 0xea510000, 0xe8b13000, 0xa93556a5, 0xebfb6199, 0xb15c2254, 0x33c5244f, 0x8c3a0000, 0xda980000, 0x607f0000, 0x54078800, 0x85714513, 0x6006b243, 0xdb50399c, 0x8a58e6a4, + 0x033d0000, 0x08b30000, 0xf33a0000, 0x3ac20007, 0x51298a50, 0x6b6e661f, 0x0ea5cfe3, 0xe6da7ffe, 0xa8da0000, 0x96be0000, 0x5c1d0000, 0x07da0002, 0x7d669583, 0x1f98708a, 0xbb668808, 0xda878000, + 0xa8da0000, 0x96be0000, 0x5c1d0000, 0x07da0002, 0x7d669583, 0x1f98708a, 0xbb668808, 0xda878000, 0xabe70000, 0x9e0d0000, 0xaf270000, 0x3d180005, 0x2c4f1fd3, 0x74f61695, 0xb5c347eb, 0x3c5dfffe, + 0x01930000, 0xe7820000, 0xedfb0000, 0xcf0c000b, 0x8dd08d58, 0xbca3b42e, 0x063661e1, 0x536f9e7b, 0x92280000, 0xdc850000, 0x57fa0000, 0x56dc0003, 0xbae92316, 0x5aefa30c, 0x90cef752, 0x7b1675d7, + 0x92280000, 0xdc850000, 0x57fa0000, 0x56dc0003, 0xbae92316, 0x5aefa30c, 0x90cef752, 0x7b1675d7, 0x93bb0000, 0x3b070000, 0xba010000, 0x99d00008, 0x3739ae4e, 0xe64c1722, 0x96f896b3, 0x2879ebac, + 0x5fa80000, 0x56030000, 0x43ae0000, 0x64f30013, 0x257e86bf, 0x1311944e, 0x541e95bf, 0x8ea4db69, 0x00440000, 0x7f480000, 0xda7c0000, 0x2a230001, 0x3badc9cc, 0xa9b69c87, 0x030a9e60, 0xbe0a679e, + 0x00440000, 0x7f480000, 0xda7c0000, 0x2a230001, 0x3badc9cc, 0xa9b69c87, 0x030a9e60, 0xbe0a679e, 0x5fec0000, 0x294b0000, 0x99d20000, 0x4ed00012, 0x1ed34f73, 0xbaa708c9, 0x57140bdf, 0x30aebcf7, + 0xee930000, 0xd6070000, 0x92c10000, 0x2b9801e0, 0x9451287c, 0x3b6cfb57, 0x45312374, 0x201f6a64, 0x7b280000, 0x57420000, 0xa9e50000, 0x634300a0, 0x9edb442f, 0x6d9995bb, 0x27f83b03, 0xc7ff60f0, + 0x7b280000, 0x57420000, 0xa9e50000, 0x634300a0, 0x9edb442f, 0x6d9995bb, 0x27f83b03, 0xc7ff60f0, 0x95bb0000, 0x81450000, 0x3b240000, 0x48db0140, 0x0a8a6c53, 0x56f56eec, 0x62c91877, 0xe7e00a94 +}; + +#define SBOX(a, b, c, d) { \ + uint32_t t; \ + t =(a); \ + a =(a & c) ^ d; \ + c =(c ^ b) ^ a; \ + d =(d | t) ^ b; \ + b = d; \ + d =((d | (t ^ c)) ^ a); \ + a&= b; \ + t^=(c ^ a); \ + b = b ^ d ^ t; \ + (a) = (c); \ + (c) = (b); \ + (b) = (d); \ + (d) = (~t); \ + } + +#define HAMSI_L(a, b, c, d) { \ + (a) = ROTL32(a, 13); \ + (c) = ROTL32(c, 3); \ + (b) ^= (a) ^ (c); \ + (d) ^= (c) ^ ((a) << 3); \ + (b) = ROTL32(b, 1); \ + (d) = ROTL32(d, 7); \ + (a) = ROTL32(a ^ b ^ d, 5); \ + (c) = ROTL32(c ^ d ^ (b<<7), 22); \ + } + +#define ROUND_BIG(rc, alpha) { \ + m[ 0] ^= alpha[ 0]; \ + c[ 4] ^= alpha[ 8]; \ + m[ 8] ^= alpha[16]; \ + c[12] ^= alpha[24]; \ + m[ 1] ^= alpha[ 1] ^ (rc); \ + c[ 5] ^= alpha[ 9]; \ + m[ 9] ^= alpha[17]; \ + c[13] ^= alpha[25]; \ + c[ 0] ^= alpha[ 2]; \ + m[ 4] ^= alpha[10]; \ + c[ 8] ^= alpha[18]; \ + m[12] ^= alpha[26]; \ + c[ 1] ^= alpha[ 3]; \ + m[ 5] ^= alpha[11]; \ + c[ 9] ^= alpha[19]; \ + m[13] ^= alpha[27]; \ + m[ 2] ^= alpha[ 4]; \ + c[ 6] ^= alpha[12]; \ + m[10] ^= alpha[20]; \ + c[14] ^= alpha[28]; \ + m[ 3] ^= alpha[ 5]; \ + c[ 7] ^= alpha[13]; \ + m[11] ^= alpha[21]; \ + c[15] ^= alpha[29]; \ + c[ 2] ^= alpha[ 6]; \ + m[ 6] ^= alpha[14]; \ + c[10] ^= alpha[22]; \ + m[14] ^= alpha[30]; \ + c[ 3] ^= alpha[ 7]; \ + m[ 7] ^= alpha[15]; \ + c[11] ^= alpha[23]; \ + m[15] ^= alpha[31]; \ + SBOX(m[ 0], c[ 4], m[ 8], c[12]); \ + SBOX(m[ 1], c[ 5], m[ 9], c[13]); \ + SBOX(c[ 0], m[ 4], c[ 8], m[12]); \ + SBOX(c[ 1], m[ 5], c[ 9], m[13]); \ + HAMSI_L(m[ 0], c[ 5], c[ 8], m[13]); \ + SBOX(m[ 2], c[ 6], m[10], c[14]); \ + HAMSI_L(m[ 1], m[ 4], c[ 9], c[14]); \ + SBOX(m[ 3], c[ 7], m[11], c[15]); \ + HAMSI_L(c[ 0], m[ 5], m[10], c[15]); \ + SBOX(c[ 2], m[ 6], c[10], m[14]); \ + HAMSI_L(c[ 1], c[ 6], m[11], m[14]); \ + SBOX(c[ 3], m[ 7], c[11], m[15]); \ + HAMSI_L(m[ 2], c[ 7], c[10], m[15]); \ + HAMSI_L(m[ 3], m[ 6], c[11], c[12]); \ + HAMSI_L(c[ 2], m[ 7], m[ 8], c[13]); \ + HAMSI_L(c[ 3], c[ 4], m[ 9], m[12]); \ + HAMSI_L(m[ 0], c[ 0], m[ 3], c[ 3]); \ + HAMSI_L(m[ 8], c[ 9], m[11], c[10]); \ + HAMSI_L(c[ 5], m[ 5], c[ 6], m[ 6]); \ + HAMSI_L(c[13], m[12], c[14], m[15]); \ + } + + + +//------FUGUE MACROS-------------------------------------------------- +static __constant__ const uint32_t c_S[16] = { + 0x8807a57e, 0xe616af75, 0xc5d3e4db, 0xac9ab027, + 0xd915f117, 0xb6eecc54, 0x06e8020b, 0x4a92efd1, + 0xaac6e2c9, 0xddb21398, 0xcae65838, 0x437f203f, + 0x25ea78e7, 0x951fddd6, 0xda6ed11d, 0xe13e3567 +}; + +static __device__ uint32_t mixtab0[256] = { + 0x63633297, 0x7c7c6feb, 0x77775ec7, 0x7b7b7af7, 0xf2f2e8e5, 0x6b6b0ab7, 0x6f6f16a7, 0xc5c56d39, 0x303090c0, 0x01010704, 0x67672e87, 0x2b2bd1ac, 0xfefeccd5, 0xd7d71371, 0xabab7c9a, + 0x767659c3, 0xcaca4005, 0x8282a33e, 0xc9c94909, 0x7d7d68ef, 0xfafad0c5, 0x5959947f, 0x4747ce07, 0xf0f0e6ed, 0xadad6e82, 0xd4d41a7d, 0xa2a243be, 0xafaf608a, 0x9c9cf946, 0xa4a451a6, + 0x727245d3, 0xc0c0762d, 0xb7b728ea, 0xfdfdc5d9, 0x9393d47a, 0x2626f298, 0x363682d8, 0x3f3fbdfc, 0xf7f7f3f1, 0xcccc521d, 0x34348cd0, 0xa5a556a2, 0xe5e58db9, 0xf1f1e1e9, 0x71714cdf, + 0xd8d83e4d, 0x313197c4, 0x15156b54, 0x04041c10, 0xc7c76331, 0x2323e98c, 0xc3c37f21, 0x18184860, 0x9696cf6e, 0x05051b14, 0x9a9aeb5e, 0x0707151c, 0x12127e48, 0x8080ad36, 0xe2e298a5, + 0xebeba781, 0x2727f59c, 0xb2b233fe, 0x757550cf, 0x09093f24, 0x8383a43a, 0x2c2cc4b0, 0x1a1a4668, 0x1b1b416c, 0x6e6e11a3, 0x5a5a9d73, 0xa0a04db6, 0x5252a553, 0x3b3ba1ec, 0xd6d61475, + 0xb3b334fa, 0x2929dfa4, 0xe3e39fa1, 0x2f2fcdbc, 0x8484b126, 0x5353a257, 0xd1d10169, 0x00000000, 0xededb599, 0x2020e080, 0xfcfcc2dd, 0xb1b13af2, 0x5b5b9a77, 0x6a6a0db3, 0xcbcb4701, + 0xbebe17ce, 0x3939afe4, 0x4a4aed33, 0x4c4cff2b, 0x5858937b, 0xcfcf5b11, 0xd0d0066d, 0xefefbb91, 0xaaaa7b9e, 0xfbfbd7c1, 0x4343d217, 0x4d4df82f, 0x333399cc, 0x8585b622, 0x4545c00f, + 0xf9f9d9c9, 0x02020e08, 0x7f7f66e7, 0x5050ab5b, 0x3c3cb4f0, 0x9f9ff04a, 0xa8a87596, 0x5151ac5f, 0xa3a344ba, 0x4040db1b, 0x8f8f800a, 0x9292d37e, 0x9d9dfe42, 0x3838a8e0, 0xf5f5fdf9, + 0xbcbc19c6, 0xb6b62fee, 0xdada3045, 0x2121e784, 0x10107040, 0xffffcbd1, 0xf3f3efe1, 0xd2d20865, 0xcdcd5519, 0x0c0c2430, 0x1313794c, 0xececb29d, 0x5f5f8667, 0x9797c86a, 0x4444c70b, + 0x1717655c, 0xc4c46a3d, 0xa7a758aa, 0x7e7e61e3, 0x3d3db3f4, 0x6464278b, 0x5d5d886f, 0x19194f64, 0x737342d7, 0x60603b9b, 0x8181aa32, 0x4f4ff627, 0xdcdc225d, 0x2222ee88, 0x2a2ad6a8, + 0x9090dd76, 0x88889516, 0x4646c903, 0xeeeebc95, 0xb8b805d6, 0x14146c50, 0xdede2c55, 0x5e5e8163, 0x0b0b312c, 0xdbdb3741, 0xe0e096ad, 0x32329ec8, 0x3a3aa6e8, 0x0a0a3628, 0x4949e43f, + 0x06061218, 0x2424fc90, 0x5c5c8f6b, 0xc2c27825, 0xd3d30f61, 0xacac6986, 0x62623593, 0x9191da72, 0x9595c662, 0xe4e48abd, 0x797974ff, 0xe7e783b1, 0xc8c84e0d, 0x373785dc, 0x6d6d18af, + 0x8d8d8e02, 0xd5d51d79, 0x4e4ef123, 0xa9a97292, 0x6c6c1fab, 0x5656b943, 0xf4f4fafd, 0xeaeaa085, 0x6565208f, 0x7a7a7df3, 0xaeae678e, 0x08083820, 0xbaba0bde, 0x787873fb, 0x2525fb94, + 0x2e2ecab8, 0x1c1c5470, 0xa6a65fae, 0xb4b421e6, 0xc6c66435, 0xe8e8ae8d, 0xdddd2559, 0x747457cb, 0x1f1f5d7c, 0x4b4bea37, 0xbdbd1ec2, 0x8b8b9c1a, 0x8a8a9b1e, 0x70704bdb, 0x3e3ebaf8, + 0xb5b526e2, 0x66662983, 0x4848e33b, 0x0303090c, 0xf6f6f4f5, 0x0e0e2a38, 0x61613c9f, 0x35358bd4, 0x5757be47, 0xb9b902d2, 0x8686bf2e, 0xc1c17129, 0x1d1d5374, 0x9e9ef74e, 0xe1e191a9, + 0xf8f8decd, 0x9898e556, 0x11117744, 0x696904bf, 0xd9d93949, 0x8e8e870e, 0x9494c166, 0x9b9bec5a, 0x1e1e5a78, 0x8787b82a, 0xe9e9a989, 0xcece5c15, 0x5555b04f, 0x2828d8a0, 0xdfdf2b51, + 0x8c8c8906, 0xa1a14ab2, 0x89899212, 0x0d0d2334, 0xbfbf10ca, 0xe6e684b5, 0x4242d513, 0x686803bb, 0x4141dc1f, 0x9999e252, 0x2d2dc3b4, 0x0f0f2d3c, 0xb0b03df6, 0x5454b74b, 0xbbbb0cda, + 0x16166258 +}; + + +#define mixtab0(x) shared[0][x] +#define mixtab1(x) shared[1][x] +#define mixtab2(x) shared[2][x] +#define mixtab3(x) shared[3][x] + +#define TIX4(q, x00, x01, x04, x07, x08, x22, x24, x27, x30) { \ + x22 ^= x00; \ + x00 = (q); \ + x08 ^= (q); \ + x01 ^= x24; \ + x04 ^= x27; \ + x07 ^= x30; \ + } + +#define CMIX36(x00, x01, x02, x04, x05, x06, x18, x19, x20) { \ + x00 ^= x04; \ + x01 ^= x05; \ + x02 ^= x06; \ + x18 ^= x04; \ + x19 ^= x05; \ + x20 ^= x06; \ + } + +__device__ __forceinline__ +static void SMIX_LDG(const uint32_t shared[4][256], uint32_t &x0, uint32_t &x1, uint32_t &x2, uint32_t &x3){ + uint32_t c0 = __ldg(&mixtab0[__byte_perm(x0, 0, 0x4443)]); + uint32_t r1 = mixtab1(__byte_perm(x0, 0, 0x4442)); + uint32_t r2 = mixtab2(__byte_perm(x0, 0, 0x4441)); + uint32_t r3 = mixtab3(__byte_perm(x0, 0, 0x4440)); + c0 = c0 ^ r1 ^ r2 ^ r3; + uint32_t r0 = mixtab0(__byte_perm(x1, 0, 0x4443)); + uint32_t c1 = r0 ^ mixtab1(__byte_perm(x1, 0, 0x4442)); + uint32_t tmp = mixtab2(__byte_perm(x1, 0, 0x4441)); + c1 ^= tmp; + r2 ^= tmp; + tmp = mixtab3(__byte_perm(x1, 0, 0x4440)); + c1 ^= tmp; + r3 ^= tmp; + uint32_t c2 = __ldg(&mixtab0[__byte_perm(x2, 0, 0x4443)]); + r0 ^= c2; + tmp = mixtab1(__byte_perm(x2, 0, 0x4442)); + c2 ^= tmp; + r1 ^= tmp; + tmp = mixtab2(__byte_perm(x2, 0, 0x4441)); + c2 ^= tmp; + tmp = mixtab3(__byte_perm(x2, 0, 0x4440)); + c2 ^= tmp; + r3 ^= tmp; + uint32_t c3 = __ldg(&mixtab0[__byte_perm(x3, 0, 0x4443)]); + r0 ^= c3; + tmp = mixtab1(__byte_perm(x3, 0, 0x4442)); + c3 ^= tmp; + r1 ^= tmp; + tmp = mixtab2(__byte_perm(x3, 0, 0x4441)); + c3 ^= tmp; + r2 ^= tmp; + tmp = mixtab3(__byte_perm(x3, 0, 0x4440)); + c3 ^= tmp; + x0 = ((c0 ^ (r0 << 0)) & 0xFF000000) | ((c1 ^ (r1 << 0)) & 0x00FF0000) | ((c2 ^ (r2 << 0)) & 0x0000FF00) | ((c3 ^ (r3 << 0)) & 0x000000FF); + x1 = ((c1 ^ (r0 << 8)) & 0xFF000000) | ((c2 ^ (r1 << 8)) & 0x00FF0000) | ((c3 ^ (r2 << 8)) & 0x0000FF00) | ((c0 ^ (r3 >> 24)) & 0x000000FF); + x2 = ((c2 ^ (r0 << 16)) & 0xFF000000) | ((c3 ^ (r1 << 16)) & 0x00FF0000) | ((c0 ^ (r2 >> 16)) & 0x0000FF00) | ((c1 ^ (r3 >> 16)) & 0x000000FF); + x3 = ((c3 ^ (r0 << 24)) & 0xFF000000) | ((c0 ^ (r1 >> 8)) & 0x00FF0000) | ((c1 ^ (r2 >> 8)) & 0x0000FF00) | ((c2 ^ (r3 >> 8)) & 0x000000FF); +} + +__device__ __forceinline__ +static void SMIX(const uint32_t shared[4][256], uint32_t &x0, uint32_t &x1, uint32_t &x2, uint32_t &x3){ + uint32_t c0 = mixtab0(__byte_perm(x0, 0, 0x4443)); + uint32_t r1 = mixtab1(__byte_perm(x0, 0, 0x4442)); + uint32_t r2 = mixtab2(__byte_perm(x0, 0, 0x4441)); + uint32_t r3 = mixtab3(__byte_perm(x0, 0, 0x4440)); + c0 = c0 ^ r1 ^ r2 ^ r3; + uint32_t r0 = mixtab0(__byte_perm(x1, 0, 0x4443)); + uint32_t c1 = r0 ^ mixtab1(__byte_perm(x1, 0, 0x4442)); + uint32_t tmp = mixtab2(__byte_perm(x1, 0, 0x4441)); + c1 ^= tmp; + r2 ^= tmp; + tmp = mixtab3(__byte_perm(x1, 0, 0x4440)); + c1 ^= tmp; + r3 ^= tmp; + uint32_t c2 = mixtab0(__byte_perm(x2, 0, 0x4443)); + r0 ^= c2; + tmp = mixtab1(__byte_perm(x2, 0, 0x4442)); + c2 ^= tmp; + r1 ^= tmp; + tmp = mixtab2(__byte_perm(x2, 0, 0x4441)); + c2 ^= tmp; + tmp = mixtab3(__byte_perm(x2, 0, 0x4440)); + c2 ^= tmp; + r3 ^= tmp; + uint32_t c3 = mixtab0(__byte_perm(x3, 0, 0x4443)); + r0 ^= c3; + tmp = mixtab1(__byte_perm(x3, 0, 0x4442)); + c3 ^= tmp; + r1 ^= tmp; + tmp = mixtab2(__byte_perm(x3, 0, 0x4441)); + c3 ^= tmp; + r2 ^= tmp; + tmp = mixtab3(__byte_perm(x3, 0, 0x4440)); + c3 ^= tmp; + x0 = ((c0 ^ (r0 << 0)) & 0xFF000000) | ((c1 ^ (r1 << 0)) & 0x00FF0000) | ((c2 ^ (r2 << 0)) & 0x0000FF00) | ((c3 ^ (r3 << 0)) & 0x000000FF); + x1 = ((c1 ^ (r0 << 8)) & 0xFF000000) | ((c2 ^ (r1 << 8)) & 0x00FF0000) | ((c3 ^ (r2 << 8)) & 0x0000FF00) | ((c0 ^ (r3 >> 24)) & 0x000000FF); + x2 = ((c2 ^ (r0 << 16)) & 0xFF000000) | ((c3 ^ (r1 << 16)) & 0x00FF0000) | ((c0 ^ (r2 >> 16)) & 0x0000FF00) | ((c1 ^ (r3 >> 16)) & 0x000000FF); + x3 = ((c3 ^ (r0 << 24)) & 0xFF000000) | ((c0 ^ (r1 >> 8)) & 0x00FF0000) | ((c1 ^ (r2 >> 8)) & 0x0000FF00) | ((c2 ^ (r3 >> 8)) & 0x000000FF); +} + +#define mROR3 { \ + B[ 6] = S[33], B[ 7] = S[34], B[ 8] = S[35]; \ + S[35] = S[32]; S[34] = S[31]; S[33] = S[30]; S[32] = S[29]; S[31] = S[28]; S[30] = S[27]; S[29] = S[26]; S[28] = S[25]; S[27] = S[24]; \ + S[26] = S[23]; S[25] = S[22]; S[24] = S[21]; S[23] = S[20]; S[22] = S[19]; S[21] = S[18]; S[20] = S[17]; S[19] = S[16]; S[18] = S[15]; \ + S[17] = S[14]; S[16] = S[13]; S[15] = S[12]; S[14] = S[11]; S[13] = S[10]; S[12] = S[ 9]; S[11] = S[ 8]; S[10] = S[ 7]; S[ 9] = S[ 6]; \ + S[ 8] = S[ 5]; S[ 7] = S[ 4]; S[ 6] = S[ 3]; S[ 5] = S[ 2]; S[ 4] = S[ 1]; S[ 3] = S[ 0]; S[ 2] = B[ 8]; S[ 1] = B[ 7]; S[ 0] = B[ 6]; \ + } + +#define mROR8 { \ + B[ 1] = S[28]; B[ 2] = S[29]; B[ 3] = S[30]; B[ 4] = S[31]; B[ 5] = S[32]; B[ 6] = S[33]; B[ 7] = S[34]; B[ 8] = S[35]; \ + S[35] = S[27]; S[34] = S[26]; S[33] = S[25]; S[32] = S[24]; S[31] = S[23]; S[30] = S[22]; S[29] = S[21]; S[28] = S[20]; S[27] = S[19]; \ + S[26] = S[18]; S[25] = S[17]; S[24] = S[16]; S[23] = S[15]; S[22] = S[14]; S[21] = S[13]; S[20] = S[12]; S[19] = S[11]; S[18] = S[10]; \ + S[17] = S[ 9]; S[16] = S[ 8]; S[15] = S[ 7]; S[14] = S[ 6]; S[13] = S[ 5]; S[12] = S[ 4]; S[11] = S[ 3]; S[10] = S[ 2]; S[ 9] = S[ 1]; \ + S[ 8] = S[ 0]; S[ 7] = B[ 8]; S[ 6] = B[ 7]; S[ 5] = B[ 6]; S[ 4] = B[ 5]; S[ 3] = B[ 4]; S[ 2] = B[ 3]; S[ 1] = B[ 2]; S[ 0] = B[ 1]; \ + } + +#define mROR9 { \ + B[ 0] = S[27]; B[ 1] = S[28]; B[ 2] = S[29]; B[ 3] = S[30]; B[ 4] = S[31]; B[ 5] = S[32]; B[ 6] = S[33]; B[ 7] = S[34]; B[ 8] = S[35]; \ + S[35] = S[26]; S[34] = S[25]; S[33] = S[24]; S[32] = S[23]; S[31] = S[22]; S[30] = S[21]; S[29] = S[20]; S[28] = S[19]; S[27] = S[18]; \ + S[26] = S[17]; S[25] = S[16]; S[24] = S[15]; S[23] = S[14]; S[22] = S[13]; S[21] = S[12]; S[20] = S[11]; S[19] = S[10]; S[18] = S[ 9]; \ + S[17] = S[ 8]; S[16] = S[ 7]; S[15] = S[ 6]; S[14] = S[ 5]; S[13] = S[ 4]; S[12] = S[ 3]; S[11] = S[ 2]; S[10] = S[ 1]; S[ 9] = S[ 0]; \ + S[ 8] = B[ 8]; S[ 7] = B[ 7]; S[ 6] = B[ 6]; S[ 5] = B[ 5]; S[ 4] = B[ 4]; S[ 3] = B[ 3]; S[ 2] = B[ 2]; S[ 1] = B[ 1]; S[ 0] = B[ 0]; \ + } + +#define FUGUE512_3(x, y, z) { \ + TIX4(x, S[ 0], S[ 1], S[ 4], S[ 7], S[ 8], S[22], S[24], S[27], S[30]); \ + CMIX36(S[33], S[34], S[35], S[ 1], S[ 2], S[ 3], S[15], S[16], S[17]); \ + SMIX(shared, S[33], S[34], S[35], S[ 0]); \ + CMIX36(S[30], S[31], S[32], S[34], S[35], S[ 0], S[12], S[13], S[14]); \ + SMIX(shared, S[30], S[31], S[32], S[33]); \ + CMIX36(S[27], S[28], S[29], S[31], S[32], S[33], S[ 9], S[10], S[11]); \ + SMIX(shared, S[27], S[28], S[29], S[30]); \ + CMIX36(S[24], S[25], S[26], S[28], S[29], S[30], S[ 6], S[ 7], S[ 8]); \ + SMIX_LDG(shared, S[24], S[25], S[26], S[27]); \ + \ + TIX4(y, S[24], S[25], S[28], S[31], S[32], S[10], S[12], S[15], S[18]); \ + CMIX36(S[21], S[22], S[23], S[25], S[26], S[27], S[ 3], S[ 4], S[ 5]); \ + SMIX(shared, S[21], S[22], S[23], S[24]); \ + CMIX36(S[18], S[19], S[20], S[22], S[23], S[24], S[ 0], S[ 1], S[ 2]); \ + SMIX_LDG(shared, S[18], S[19], S[20], S[21]); \ + CMIX36(S[15], S[16], S[17], S[19], S[20], S[21], S[33], S[34], S[35]); \ + SMIX(shared, S[15], S[16], S[17], S[18]); \ + CMIX36(S[12], S[13], S[14], S[16], S[17], S[18], S[30], S[31], S[32]); \ + SMIX_LDG(shared, S[12], S[13], S[14], S[15]); \ + \ + TIX4(z, S[12], S[13], S[16], S[19], S[20], S[34], S[ 0], S[ 3], S[ 6]); \ + CMIX36(S[ 9], S[10], S[11], S[13], S[14], S[15], S[27], S[28], S[29]); \ + SMIX(shared, S[ 9], S[10], S[11], S[12]); \ + CMIX36(S[ 6], S[ 7], S[ 8], S[10], S[11], S[12], S[24], S[25], S[26]); \ + SMIX_LDG(shared, S[ 6], S[ 7], S[ 8], S[ 9]); \ + CMIX36(S[ 3], S[ 4], S[ 5], S[ 7], S[ 8], S[ 9], S[21], S[22], S[23]); \ + SMIX_LDG(shared, S[ 3], S[ 4], S[ 5], S[ 6]); \ + CMIX36(S[ 0], S[ 1], S[ 2], S[ 4], S[ 5], S[ 6], S[18], S[19], S[20]); \ + SMIX_LDG(shared, S[ 0], S[ 1], S[ 2], S[ 3]); \ + } + +//------END OF FUGUE MACROS------------------------------------------- + + #ifdef __INTELLISENSE__ /* just for vstudio code colors */ @@ -163,7 +610,7 @@ __launch_bounds__(TPB52_2,1) #else __launch_bounds__(TPB50_2,4) #endif -static void x11_simd512_gpu_compress_64_maxwell(uint32_t threads, uint32_t *g_hash,const uint4 *const __restrict__ g_fft4) +static void x11_simd512_gpu_compress_64(uint32_t threads, uint32_t *g_hash,const uint4 *const __restrict__ g_fft4) { const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); const uint32_t thr_offset = thread << 6; // thr_id * 128 (je zwei elemente) @@ -222,6 +669,19 @@ __host__ int x11_simd512_cpu_init(int thr_id, uint32_t threads) { cudaMalloc(&d_temp4[thr_id], 64*sizeof(uint4)*threads); + + // whirlpool + uint64_t* table0 = NULL; + table0 = (uint64_t*)plain_T0; + cudaMemcpyToSymbol(InitVector_RC, plain_RC, 10 * sizeof(uint64_t), 0, cudaMemcpyHostToDevice); + cudaMemcpyToSymbol(precomputed_round_key_64, plain_precomputed_round_key_64, 72 * sizeof(uint64_t), 0, cudaMemcpyHostToDevice); + cudaMemcpyToSymbol(b0, table0, 256 * sizeof(uint64_t), 0, cudaMemcpyHostToDevice); + uint64_t table7[256]; + for (int i = 0; i<256; i++){ + table7[i] = ROTR64(table0[i], 8); + } + cudaMemcpyToSymbol(b7, table7, 256 * sizeof(uint64_t), 0, cudaMemcpyHostToDevice); + return 0; } @@ -273,7 +733,7 @@ __launch_bounds__(128, 5) #else __launch_bounds__(128, 5) #endif -static void x16_simd512_gpu_compress_64_maxwell_echo512(uint32_t *g_hash, const uint4 *const __restrict__ g_fft4) +static void x16_simd512_gpu_compress_64_echo512(uint32_t *g_hash, const uint4 *const __restrict__ g_fft4) { const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); const uint32_t thr_offset = thread << 6; // thr_id * 128 (je zwei elemente) @@ -433,6 +893,429 @@ static void x16_simd512_gpu_compress_64_maxwell_echo512(uint32_t *g_hash, const // } } +__global__ //__launch_bounds__(128, 5) +static void x16_simd512_gpu_compress_64_fugue512(uint32_t *g_hash, const uint4 *const __restrict__ g_fft4) +{ + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + const uint32_t thr_offset = thread << 6; // thr_id * 128 (je zwei elemente) + __shared__ uint32_t shared[4][256]; + + if (threadIdx.x<128) + { + uint2 temp = __ldg(&((uint2*)&mixtab0)[threadIdx.x]); + + shared[0][(threadIdx.x << 1) + 0] = temp.x; + shared[0][(threadIdx.x << 1) + 1] = temp.y; + shared[1][(threadIdx.x << 1) + 0] = ROR8(temp.x); + shared[1][(threadIdx.x << 1) + 1] = ROR8(temp.y); + shared[2][(threadIdx.x << 1) + 0] = ROL16(temp.x); + shared[2][(threadIdx.x << 1) + 1] = ROL16(temp.y); + shared[3][(threadIdx.x << 1) + 0] = ROL8(temp.x); + shared[3][(threadIdx.x << 1) + 1] = ROL8(temp.y); + + + /* const uint32_t tmp = mixtab0[threadIdx.x]; + shared[0][threadIdx.x] = tmp; + shared[1][threadIdx.x] = ROR8(tmp); + shared[2][threadIdx.x] = ROL16(tmp); + shared[3][threadIdx.x] = ROL8(tmp); + */ + } + + + const uint32_t P[48] = { + 0xe7e9f5f5, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af, 0xa4213d7e, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af, + 0x01425eb8, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af, 0x65978b09, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af, + 0x2cb6b661, 0x6b23b3b3, 0xcf93a7cf, 0x9d9d3751, 0x9ac2dea3, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af, + 0x579f9f33, 0xfbfbfbfb, 0xfbfbfbfb, 0xefefd3c7, 0xdbfde1dd, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af, + 0x34514d9e, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af, 0xb134347e, 0xea6f7e7e, 0xbd7731bd, 0x8a8a1968, + 0x14b8a457, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af, 0x265f4382, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af + }; + uint32_t k0; + uint32_t h[16]; + + // if (thread < threads){ + + uint32_t *Hash = &g_hash[thread << 4]; + + uint32_t A[32]; + + *(uint2x4*)&A[0] = *(uint2x4*)&c_IV_512[0] ^ __ldg4((uint2x4*)&Hash[0]); + *(uint2x4*)&A[8] = *(uint2x4*)&c_IV_512[8] ^ __ldg4((uint2x4*)&Hash[8]); + *(uint2x4*)&A[16] = *(uint2x4*)&c_IV_512[16]; + *(uint2x4*)&A[24] = *(uint2x4*)&c_IV_512[24]; + + __syncthreads(); + + SIMD_Compress(A, thr_offset, g_fft4); + + /* + #pragma unroll 16 + for (int i = 0; i<16; i++){ + h[i] = A[i]; + } + *(uint2x4*)&p[0] = *(uint2x4*)&A[0]; + *(uint2x4*)&p[4] = *(uint2x4*)&A[8]; + */ + + // uint32_t *Hash = &g_hash[thread << 4]; + // uint8_t h1[64]; + uint32_t c[16], m[16]; + *(uint2x4*)&h[0] = *((uint2x4*)&A[0]); + *(uint2x4*)&h[8] = *((uint2x4*)&A[8]); + + for (int i = 0; i < 16; i++) + { + h[i] = cuda_swab32(h[i]); + } + + __syncthreads(); + + // *(uint2x4*)&Hash[ 0] = *(uint2x4*)&h[ 0]; + // *(uint2x4*)&Hash[ 8] = *(uint2x4*)&h[ 8]; + uint32_t S[36]; + uint32_t B[9]; + + S[0] = S[1] = S[2] = S[3] = S[4] = S[5] = S[6] = S[7] = S[8] = S[9] = S[10] = S[11] = S[12] = S[13] = S[14] = S[15] = S[16] = S[17] = S[18] = S[19] = 0; + *(uint2x4*)&S[20] = *(uint2x4*)&c_S[0]; +#pragma unroll 8 + for (int i = 0; i<8; i++){ + S[28 + i] = c_S[i + 8]; + } + + FUGUE512_3(h[0x0], h[0x1], h[0x2]); + FUGUE512_3(h[0x3], h[0x4], h[0x5]); + FUGUE512_3(h[0x6], h[0x7], h[0x8]); + FUGUE512_3(h[0x9], h[0xA], h[0xB]); + FUGUE512_3(h[0xC], h[0xD], h[0xE]); + FUGUE512_3(h[0xF], 0U, 512U); + + for (uint32_t i = 0; i < 32; i += 2){ + mROR3; + CMIX36(S[0], S[1], S[2], S[4], S[5], S[6], S[18], S[19], S[20]); + SMIX_LDG(shared, S[0], S[1], S[2], S[3]); + mROR3; + CMIX36(S[0], S[1], S[2], S[4], S[5], S[6], S[18], S[19], S[20]); + SMIX_LDG(shared, S[0], S[1], S[2], S[3]); + } + #pragma unroll 11 + for (uint32_t i = 0; i < 13; i++) { + S[4] ^= S[0]; S[9] ^= S[0]; S[18] ^= S[0]; S[27] ^= S[0]; + mROR9; + SMIX_LDG(shared, S[0], S[1], S[2], S[3]); + S[4] ^= S[0]; S[10] ^= S[0]; S[18] ^= S[0]; S[27] ^= S[0]; + mROR9; + SMIX(shared, S[0], S[1], S[2], S[3]); + S[4] ^= S[0]; S[10] ^= S[0]; S[19] ^= S[0]; S[27] ^= S[0]; + mROR9; + SMIX_LDG(shared, S[0], S[1], S[2], S[3]); + S[4] ^= S[0]; S[10] ^= S[0]; S[19] ^= S[0]; S[28] ^= S[0]; + mROR8; + SMIX_LDG(shared, S[0], S[1], S[2], S[3]); + } + S[4] ^= S[0]; S[9] ^= S[0]; S[18] ^= S[0]; S[27] ^= S[0]; + + S[0] = cuda_swab32(S[1]); S[1] = cuda_swab32(S[2]); S[2] = cuda_swab32(S[3]); S[3] = cuda_swab32(S[4]); + S[4] = cuda_swab32(S[9]); S[5] = cuda_swab32(S[10]); S[6] = cuda_swab32(S[11]); S[7] = cuda_swab32(S[12]); + S[8] = cuda_swab32(S[18]); S[9] = cuda_swab32(S[19]); S[10] = cuda_swab32(S[20]); S[11] = cuda_swab32(S[21]); + S[12] = cuda_swab32(S[27]); S[13] = cuda_swab32(S[28]); S[14] = cuda_swab32(S[29]); S[15] = cuda_swab32(S[30]); + + *(uint2x4*)&Hash[0] = *(uint2x4*)&S[0]; + *(uint2x4*)&Hash[8] = *(uint2x4*)&S[8]; + // } +} + +__global__ //__launch_bounds__(128, 5) +static void x16_simd512_gpu_compress_64_hamsi512(uint32_t *g_hash, const uint4 *const __restrict__ g_fft4) +{ + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + const uint32_t thr_offset = thread << 6; // thr_id * 128 (je zwei elemente) + + const uint32_t P[48] = { + 0xe7e9f5f5, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af, 0xa4213d7e, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af, + 0x01425eb8, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af, 0x65978b09, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af, + 0x2cb6b661, 0x6b23b3b3, 0xcf93a7cf, 0x9d9d3751, 0x9ac2dea3, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af, + 0x579f9f33, 0xfbfbfbfb, 0xfbfbfbfb, 0xefefd3c7, 0xdbfde1dd, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af, + 0x34514d9e, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af, 0xb134347e, 0xea6f7e7e, 0xbd7731bd, 0x8a8a1968, + 0x14b8a457, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af, 0x265f4382, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af + }; + uint32_t k0; + uint32_t h[16]; + + // if (thread < threads){ + + uint32_t *Hash = &g_hash[thread << 4]; + + uint32_t A[32]; + + *(uint2x4*)&A[0] = *(uint2x4*)&c_IV_512[0] ^ __ldg4((uint2x4*)&Hash[0]); + *(uint2x4*)&A[8] = *(uint2x4*)&c_IV_512[8] ^ __ldg4((uint2x4*)&Hash[8]); + *(uint2x4*)&A[16] = *(uint2x4*)&c_IV_512[16]; + *(uint2x4*)&A[24] = *(uint2x4*)&c_IV_512[24]; + + __syncthreads(); + + SIMD_Compress(A, thr_offset, g_fft4); + + uint8_t h1[64]; + + // uint32_t c[16], m[16]; + // *(uint2x4*)&h[0] = *((uint2x4*)&A[0]); + // *(uint2x4*)&h[8] = *((uint2x4*)&A[8]); + + *(uint2x4*)&h1[0] = *(uint2x4*)&A[0]; + *(uint2x4*)&h1[32] = *(uint2x4*)&A[8]; + + uint32_t c[16], m[16]; + *(uint16*)&c[0] = *(uint16*)&c_c[0]; + *(uint16*)&h[0] = *(uint16*)&c_c[0]; + + const uint32_t *tp; + uint32_t dm; + + for (int i = 0; i < 64; i += 8) + { + tp = &d_T512[0]; + + dm = -(h1[i] & 1); + m[0] = dm & tp[0]; m[1] = dm & tp[1]; + m[2] = dm & tp[2]; m[3] = dm & tp[3]; + m[4] = dm & tp[4]; m[5] = dm & tp[5]; + m[6] = dm & tp[6]; m[7] = dm & tp[7]; + m[8] = dm & tp[8]; m[9] = dm & tp[9]; + m[10] = dm & tp[10]; m[11] = dm & tp[11]; + m[12] = dm & tp[12]; m[13] = dm & tp[13]; + m[14] = dm & tp[14]; m[15] = dm & tp[15]; + tp += 16; + //#pragma unroll 7 + for (int v = 1; v < 8; v++) { + dm = -((h1[i] >> v) & 1); + m[0] ^= dm & tp[0]; m[1] ^= dm & tp[1]; + m[2] ^= dm & tp[2]; m[3] ^= dm & tp[3]; + m[4] ^= dm & tp[4]; m[5] ^= dm & tp[5]; + m[6] ^= dm & tp[6]; m[7] ^= dm & tp[7]; + m[8] ^= dm & tp[8]; m[9] ^= dm & tp[9]; + m[10] ^= dm & tp[10]; m[11] ^= dm & tp[11]; + m[12] ^= dm & tp[12]; m[13] ^= dm & tp[13]; + m[14] ^= dm & tp[14]; m[15] ^= dm & tp[15]; + tp += 16; + } + + //#pragma unroll + for (int u = 1; u < 8; u++) { +#pragma unroll 8 + for (int v = 0; v < 8; v++) { + dm = -((h1[i + u] >> v) & 1); + m[0] ^= dm & tp[0]; m[1] ^= dm & tp[1]; + m[2] ^= dm & tp[2]; m[3] ^= dm & tp[3]; + m[4] ^= dm & tp[4]; m[5] ^= dm & tp[5]; + m[6] ^= dm & tp[6]; m[7] ^= dm & tp[7]; + m[8] ^= dm & tp[8]; m[9] ^= dm & tp[9]; + m[10] ^= dm & tp[10]; m[11] ^= dm & tp[11]; + m[12] ^= dm & tp[12]; m[13] ^= dm & tp[13]; + m[14] ^= dm & tp[14]; m[15] ^= dm & tp[15]; + tp += 16; + } + } + + //#pragma unroll 6 + for (int r = 0; r < 6; r++) { + ROUND_BIG(r, d_alpha_n); + } + /* order is (no more) important */ + h[0] ^= m[0]; h[1] ^= m[1]; h[2] ^= c[0]; h[3] ^= c[1]; + h[4] ^= m[2]; h[5] ^= m[3]; h[6] ^= c[2]; h[7] ^= c[3]; + h[8] ^= m[8]; h[9] ^= m[9]; h[10] ^= c[8]; h[11] ^= c[9]; + h[12] ^= m[10]; h[13] ^= m[11]; h[14] ^= c[10]; h[15] ^= c[11]; + + *(uint16*)&c[0] = *(uint16*)&h[0]; + } + + *(uint2x4*)&m[0] = *(uint2x4*)&d_T512[112]; + *(uint2x4*)&m[8] = *(uint2x4*)&d_T512[120]; + +#pragma unroll 6 + for (int r = 0; r < 6; r++) { + ROUND_BIG(r, d_alpha_n); + } + + /* order is (no more) important */ + h[0] ^= m[0]; h[1] ^= m[1]; h[2] ^= c[0]; h[3] ^= c[1]; + h[4] ^= m[2]; h[5] ^= m[3]; h[6] ^= c[2]; h[7] ^= c[3]; + h[8] ^= m[8]; h[9] ^= m[9]; h[10] ^= c[8]; h[11] ^= c[9]; + h[12] ^= m[10]; h[13] ^= m[11]; h[14] ^= c[10]; h[15] ^= c[11]; + + *(uint16*)&c[0] = *(uint16*)&h[0]; + + *(uint2x4*)&m[0] = *(uint2x4*)&d_T512[784]; + *(uint2x4*)&m[8] = *(uint2x4*)&d_T512[792]; + +#pragma unroll 12 + for (int r = 0; r < 12; r++) + ROUND_BIG(r, d_alpha_f); + + /* order is (no more) important */ + h[0] ^= m[0]; h[1] ^= m[1]; h[2] ^= c[0]; h[3] ^= c[1]; + h[4] ^= m[2]; h[5] ^= m[3]; h[6] ^= c[2]; h[7] ^= c[3]; + h[8] ^= m[8]; h[9] ^= m[9]; h[10] ^= c[8]; h[11] ^= c[9]; + h[12] ^= m[10]; h[13] ^= m[11]; h[14] ^= c[10]; h[15] ^= c[11]; + + +#pragma unroll 16 + for (int i = 0; i < 16; i++) + h[i] = cuda_swab32(h[i]); + + *(uint2x4*)&Hash[0] = *(uint2x4*)&h[0]; + *(uint2x4*)&Hash[8] = *(uint2x4*)&h[8]; +} + +__global__ //__launch_bounds__(128, 5) +static void x16_simd512_gpu_compress_64_whirlpool512(uint32_t *g_hash, const uint4 *const __restrict__ g_fft4) +{ + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + const uint32_t thr_offset = thread << 6; // thr_id * 128 (je zwei elemente) + + const uint32_t P[48] = { + 0xe7e9f5f5, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af, 0xa4213d7e, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af, + 0x01425eb8, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af, 0x65978b09, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af, + 0x2cb6b661, 0x6b23b3b3, 0xcf93a7cf, 0x9d9d3751, 0x9ac2dea3, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af, + 0x579f9f33, 0xfbfbfbfb, 0xfbfbfbfb, 0xefefd3c7, 0xdbfde1dd, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af, + 0x34514d9e, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af, 0xb134347e, 0xea6f7e7e, 0xbd7731bd, 0x8a8a1968, + 0x14b8a457, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af, 0x265f4382, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af + }; + + __shared__ uint2 sharedMemory[7][256]; + + if (threadIdx.x < 128) + { + const uint2 tmp = __ldg((uint2*)&b0[threadIdx.x]); + const uint2 tmp2 = __ldg((uint2*)&b0[threadIdx.x + 128]); + sharedMemory[0][threadIdx.x] = tmp; + sharedMemory[0][threadIdx.x + 128] = tmp2; + sharedMemory[1][threadIdx.x] = ROL8(tmp); + sharedMemory[1][threadIdx.x + 128] = ROL8(tmp2); + sharedMemory[2][threadIdx.x] = ROL16(tmp); + sharedMemory[2][threadIdx.x + 128] = ROL16(tmp2); + sharedMemory[3][threadIdx.x] = ROL24(tmp); + sharedMemory[3][threadIdx.x + 128] = ROL24(tmp2); + sharedMemory[4][threadIdx.x] = SWAPUINT2(tmp); + sharedMemory[4][threadIdx.x + 128] = SWAPUINT2(tmp2); + sharedMemory[5][threadIdx.x] = ROR24(tmp); + sharedMemory[5][threadIdx.x + 128] = ROR24(tmp2); + sharedMemory[6][threadIdx.x] = ROR16(tmp); + sharedMemory[6][threadIdx.x + 128] = ROR16(tmp2); + } + + + uint32_t k0; + // uint32_t h[16]; + + // if (thread < threads){ + + uint32_t *Hash = &g_hash[thread << 4]; + + uint32_t A[32]; + + *(uint2x4*)&A[0] = *(uint2x4*)&c_IV_512[0] ^ __ldg4((uint2x4*)&Hash[0]); + *(uint2x4*)&A[8] = *(uint2x4*)&c_IV_512[8] ^ __ldg4((uint2x4*)&Hash[8]); + *(uint2x4*)&A[16] = *(uint2x4*)&c_IV_512[16]; + *(uint2x4*)&A[24] = *(uint2x4*)&c_IV_512[24]; + + __syncthreads(); + + SIMD_Compress(A, thr_offset, g_fft4); + + uint2 hash[8], n[8], h[8]; + uint2 tmp[8] = { + { 0xC0EE0B30, 0x672990AF }, { 0x28282828, 0x28282828 }, { 0x28282828, 0x28282828 }, { 0x28282828, 0x28282828 }, + { 0x28282828, 0x28282828 }, { 0x28282828, 0x28282828 }, { 0x28282828, 0x28282828 }, { 0x28282828, 0x28282828 } + }; + + *(uint2x4*)&hash[0] = *((uint2x4*)&A[0]); //__ldg4((uint2x4*)&g_hash[(thread << 3) + 0]); + *(uint2x4*)&hash[4] = *((uint2x4*)&A[8]); //__ldg4((uint2x4*)&g_hash[(thread << 3) + 4]); + + __syncthreads(); + +#pragma unroll 8 + for (int i = 0; i<8; i++) + n[i] = hash[i]; + + tmp[0] ^= d_ROUND_ELT(sharedMemory, n, 0, 7, 6, 5, 4, 3, 2, 1); + tmp[1] ^= d_ROUND_ELT_LDG(sharedMemory, n, 1, 0, 7, 6, 5, 4, 3, 2); + tmp[2] ^= d_ROUND_ELT(sharedMemory, n, 2, 1, 0, 7, 6, 5, 4, 3); + tmp[3] ^= d_ROUND_ELT_LDG(sharedMemory, n, 3, 2, 1, 0, 7, 6, 5, 4); + tmp[4] ^= d_ROUND_ELT(sharedMemory, n, 4, 3, 2, 1, 0, 7, 6, 5); + tmp[5] ^= d_ROUND_ELT_LDG(sharedMemory, n, 5, 4, 3, 2, 1, 0, 7, 6); + tmp[6] ^= d_ROUND_ELT(sharedMemory, n, 6, 5, 4, 3, 2, 1, 0, 7); + tmp[7] ^= d_ROUND_ELT_LDG(sharedMemory, n, 7, 6, 5, 4, 3, 2, 1, 0); + for (int i = 1; i <10; i++){ + TRANSFER(n, tmp); + tmp[0] = d_ROUND_ELT1_LDG(sharedMemory, n, 0, 7, 6, 5, 4, 3, 2, 1, precomputed_round_key_64[(i - 1) * 8 + 0]); + tmp[1] = d_ROUND_ELT1(sharedMemory, n, 1, 0, 7, 6, 5, 4, 3, 2, precomputed_round_key_64[(i - 1) * 8 + 1]); + tmp[2] = d_ROUND_ELT1(sharedMemory, n, 2, 1, 0, 7, 6, 5, 4, 3, precomputed_round_key_64[(i - 1) * 8 + 2]); + tmp[3] = d_ROUND_ELT1_LDG(sharedMemory, n, 3, 2, 1, 0, 7, 6, 5, 4, precomputed_round_key_64[(i - 1) * 8 + 3]); + tmp[4] = d_ROUND_ELT1(sharedMemory, n, 4, 3, 2, 1, 0, 7, 6, 5, precomputed_round_key_64[(i - 1) * 8 + 4]); + tmp[5] = d_ROUND_ELT1(sharedMemory, n, 5, 4, 3, 2, 1, 0, 7, 6, precomputed_round_key_64[(i - 1) * 8 + 5]); + tmp[6] = d_ROUND_ELT1(sharedMemory, n, 6, 5, 4, 3, 2, 1, 0, 7, precomputed_round_key_64[(i - 1) * 8 + 6]); + tmp[7] = d_ROUND_ELT1_LDG(sharedMemory, n, 7, 6, 5, 4, 3, 2, 1, 0, precomputed_round_key_64[(i - 1) * 8 + 7]); + } + + TRANSFER(h, tmp); +#pragma unroll 8 + for (int i = 0; i<8; i++) + hash[i] = h[i] = h[i] ^ hash[i]; + +#pragma unroll 6 + for (int i = 1; i<7; i++) + n[i] = vectorize(0); + + n[0] = vectorize(0x80); + n[7] = vectorize(0x2000000000000); + +#pragma unroll 8 + for (int i = 0; i < 8; i++) { + n[i] = n[i] ^ h[i]; + } + +#pragma unroll 2 + for (int i = 0; i < 10; i++) { + tmp[0] = InitVector_RC[i]; + tmp[0] ^= d_ROUND_ELT(sharedMemory, h, 0, 7, 6, 5, 4, 3, 2, 1); + tmp[1] = d_ROUND_ELT(sharedMemory, h, 1, 0, 7, 6, 5, 4, 3, 2); + tmp[2] = d_ROUND_ELT(sharedMemory, h, 2, 1, 0, 7, 6, 5, 4, 3); + tmp[3] = d_ROUND_ELT(sharedMemory, h, 3, 2, 1, 0, 7, 6, 5, 4); + tmp[4] = d_ROUND_ELT(sharedMemory, h, 4, 3, 2, 1, 0, 7, 6, 5); + tmp[5] = d_ROUND_ELT(sharedMemory, h, 5, 4, 3, 2, 1, 0, 7, 6); + tmp[6] = d_ROUND_ELT(sharedMemory, h, 6, 5, 4, 3, 2, 1, 0, 7); + tmp[7] = d_ROUND_ELT(sharedMemory, h, 7, 6, 5, 4, 3, 2, 1, 0); + TRANSFER(h, tmp); + tmp[0] = d_ROUND_ELT1_LDG(sharedMemory, n, 0, 7, 6, 5, 4, 3, 2, 1, tmp[0]); + tmp[1] = d_ROUND_ELT1_LDG(sharedMemory, n, 1, 0, 7, 6, 5, 4, 3, 2, tmp[1]); + tmp[2] = d_ROUND_ELT1_LDG(sharedMemory, n, 2, 1, 0, 7, 6, 5, 4, 3, tmp[2]); + tmp[3] = d_ROUND_ELT1_LDG(sharedMemory, n, 3, 2, 1, 0, 7, 6, 5, 4, tmp[3]); + tmp[4] = d_ROUND_ELT1_LDG(sharedMemory, n, 4, 3, 2, 1, 0, 7, 6, 5, tmp[4]); + tmp[5] = d_ROUND_ELT1_LDG(sharedMemory, n, 5, 4, 3, 2, 1, 0, 7, 6, tmp[5]); + tmp[6] = d_ROUND_ELT1_LDG(sharedMemory, n, 6, 5, 4, 3, 2, 1, 0, 7, tmp[6]); + tmp[7] = d_ROUND_ELT1_LDG(sharedMemory, n, 7, 6, 5, 4, 3, 2, 1, 0, tmp[7]); + TRANSFER(n, tmp); + } + + hash[0] = xor3x(hash[0], n[0], vectorize(0x80)); + hash[1] = hash[1] ^ n[1]; + hash[2] = hash[2] ^ n[2]; + hash[3] = hash[3] ^ n[3]; + hash[4] = hash[4] ^ n[4]; + hash[5] = hash[5] ^ n[5]; + hash[6] = hash[6] ^ n[6]; + hash[7] = xor3x(hash[7], n[7], vectorize(0x2000000000000)); + + //*(uint2x4*)&g_hash[(thread << 6) + 0] = *(uint2x4*)&hash[0]; + //*(uint2x4*)&g_hash[(thread << 6) + 8] = *(uint2x4*)&hash[4]; + + *(uint2x4*)&Hash[0] = *(uint2x4*)&hash[0]; + *(uint2x4*)&Hash[8] = *(uint2x4*)&hash[4]; +} + @@ -452,7 +1335,7 @@ void x11_simd512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, const dim3 block2(tpb); x11_simd512_gpu_expand_64 <<>> (threads, d_hash, d_temp4[thr_id]); - x11_simd512_gpu_compress_64_maxwell <<< grid2, block2 >>> (threads, d_hash, d_temp4[thr_id]); + x11_simd512_gpu_compress_64 <<< grid2, block2 >>> (threads, d_hash, d_temp4[thr_id]); } __host__ @@ -472,5 +1355,62 @@ void x16_simd_echo512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash const dim3 block2(tpb); x11_simd512_gpu_expand_64 << > > (threads, d_hash, d_temp4[thr_id]); - x16_simd512_gpu_compress_64_maxwell_echo512 << < grid2, block2 >> > (d_hash, d_temp4[thr_id]); + x16_simd512_gpu_compress_64_echo512 << < grid2, block2 >> > (d_hash, d_temp4[thr_id]); +} + +__host__ +void x16_simd_whirlpool512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash) +{ + int dev_id = device_map[thr_id]; + + uint32_t tpb = TPB52_1; + if (device_sm[dev_id] <= 500) tpb = TPB52_1; + const dim3 grid1((8 * threads + tpb - 1) / tpb); + const dim3 block1(tpb); + + tpb = 128; + if (device_sm[dev_id] <= 500) tpb = 128; + const dim3 grid2((threads + tpb - 1) / tpb); + const dim3 block2(tpb); + + x11_simd512_gpu_expand_64 << > > (threads, d_hash, d_temp4[thr_id]); + x16_simd512_gpu_compress_64_whirlpool512 << < grid2, block2 >> > (d_hash, d_temp4[thr_id]); +} + +__host__ +void x16_simd_hamsi512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash){ + + int dev_id = device_map[thr_id]; + + uint32_t tpb = TPB52_1; + if (device_sm[dev_id] <= 500) tpb = TPB52_1; + const dim3 grid1((8 * threads + tpb - 1) / tpb); + const dim3 block1(tpb); + + tpb = 128; + if (device_sm[dev_id] <= 500) tpb = 128; + const dim3 grid2((threads + tpb - 1) / tpb); + const dim3 block2(tpb); + + x11_simd512_gpu_expand_64 << > > (threads, d_hash, d_temp4[thr_id]); + x16_simd512_gpu_compress_64_hamsi512 << < grid2, block2 >> > (d_hash, d_temp4[thr_id]); +} + +__host__ +void x16_simd_fugue512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash){ + + int dev_id = device_map[thr_id]; + + uint32_t tpb = TPB52_1; + if (device_sm[dev_id] <= 500) tpb = TPB52_1; + const dim3 grid1((8 * threads + tpb - 1) / tpb); + const dim3 block1(tpb); + + tpb = 128; + if (device_sm[dev_id] <= 500) tpb = 128; + const dim3 grid2((threads + tpb - 1) / tpb); + const dim3 block2(tpb); + + x11_simd512_gpu_expand_64 << > > (threads, d_hash, d_temp4[thr_id]); + x16_simd512_gpu_compress_64_fugue512 << < grid2, block2 >> > (d_hash, d_temp4[thr_id]); } diff --git a/x16r/x16r.cu b/x16r/x16r.cu index 1e582ca..73a941a 100644 --- a/x16r/x16r.cu +++ b/x16r/x16r.cu @@ -44,6 +44,9 @@ extern void x16_simd_echo512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t extern void x11_cubehash_shavite512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash); extern void quark_blake512_cpu_hash_64_final(int thr_id, uint32_t threads, uint32_t *d_nonceVector, uint32_t *d_outputHash, uint32_t *resNonce, const uint64_t target); extern void x13_fugue512_cpu_hash_64_final_sp(int thr_id, uint32_t threads, uint32_t *d_hash, uint32_t *d_resNonce, const uint64_t target); +extern void x16_simd_fugue512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash); +extern void x16_simd_hamsi512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash); +extern void x16_simd_whirlpool512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash); static uint32_t *d_hash[MAX_GPUS]; @@ -311,8 +314,8 @@ extern "C" int scanhash_x16r(int thr_id, struct work* work, uint32_t max_nonce, // ((uint32_t*)pdata)[2] = 0x9A9A9A9A; // ((uint32_t*)pdata)[1] = 0xAAAAAAAA; // ((uint32_t*)pdata)[2] = 0xAAAAAAAA; - ((uint32_t*)pdata)[1] = 0xCCCCCCCC; - ((uint32_t*)pdata)[2] = 0xCCCCCCCC; + ((uint32_t*)pdata)[1] = 0x9B9B9B9B; + ((uint32_t*)pdata)[2] = 0x9B9B9B9B; //((uint8_t*)pdata)[8] = 0x90; // hashOrder[0] = '9'; for simd 80 + blake512 64 //((uint8_t*)pdata)[8] = 0xA0; // hashOrder[0] = 'A'; for echo 80 + blake512 64 @@ -551,6 +554,21 @@ extern "C" int scanhash_x16r(int thr_id, struct work* work, uint32_t max_nonce, x16_simd_echo512_cpu_hash_64(thr_id, throughput,d_hash[thr_id]); i = i + 1; } + else if (nextalgo == WHIRLPOOL) + { + x16_simd_whirlpool512_cpu_hash_64(thr_id, throughput, d_hash[thr_id]); + i = i + 1; + } + else if (nextalgo == HAMSI) + { + x16_simd_hamsi512_cpu_hash_64(thr_id, throughput, d_hash[thr_id]); + i = i + 1; + } + else if (nextalgo == FUGUE) + { + x16_simd_fugue512_cpu_hash_64(thr_id, throughput, d_hash[thr_id]); + i = i + 1; + } else { x11_simd512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); diff --git a/x16r/x16s.cu b/x16r/x16s.cu index 485598d..4e22314 100644 --- a/x16r/x16s.cu +++ b/x16r/x16s.cu @@ -40,6 +40,9 @@ extern void x16_simd_echo512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t extern void x11_cubehash_shavite512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash); extern void quark_blake512_cpu_hash_64_final(int thr_id, uint32_t threads, uint32_t *d_nonceVector, uint32_t *d_outputHash, uint32_t *resNonce, const uint64_t target); extern void x13_fugue512_cpu_hash_64_final_sp(int thr_id, uint32_t threads, uint32_t *d_hash, uint32_t *d_resNonce, const uint64_t target); +extern void x16_simd_fugue512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash); +extern void x16_simd_hamsi512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash); +extern void x16_simd_whirlpool512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash); static uint32_t *d_resNonce[MAX_GPUS]; @@ -315,7 +318,7 @@ extern "C" int scanhash_x16s(int thr_id, struct work* work, uint32_t max_nonce, getAlgoString(&endiandata[1], hashOrder); s_ntime = ntime; - char *temp = "0123456789ABCDEF"; + char *temp = "C9C9C9C9C9C9C9C9"; if (opt_benchmark) { for (int k = 0; k < 16; k++) @@ -536,6 +539,21 @@ extern "C" int scanhash_x16s(int thr_id, struct work* work, uint32_t max_nonce, { x16_simd_echo512_cpu_hash_64(thr_id, throughput, d_hash[thr_id]); i = i + 1; + } + else if (nextalgo == WHIRLPOOL) + { + x16_simd_whirlpool512_cpu_hash_64(thr_id, throughput, d_hash[thr_id]); + i = i + 1; + } + else if (nextalgo == HAMSI) + { + x16_simd_hamsi512_cpu_hash_64(thr_id, throughput, d_hash[thr_id]); + i = i + 1; + } + else if (nextalgo == FUGUE) + { + x16_simd_fugue512_cpu_hash_64(thr_id, throughput, d_hash[thr_id]); + i = i + 1; } else {