Skip to content

Commit

Permalink
Reduce memory buffers in simd to increase the intensity
Browse files Browse the repository at this point in the history
  • Loading branch information
sp-hash committed Aug 5, 2018
1 parent 6e25c8c commit f462f8d
Show file tree
Hide file tree
Showing 2 changed files with 289 additions and 52 deletions.
163 changes: 137 additions & 26 deletions x16r/x16r.cu
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ extern void x14_shabal512_cpu_hash_64_final_sp(int thr_id, uint32_t threads, uin
// Declarations of hashing routines that are defined outside this file.
// All of them take a thread id (thr_id), a thread count (threads) and a hash buffer pointer
// (d_hash / d_outputHash -- presumably device memory, confirm against the definitions).
// The "_final" variants additionally take a 64-bit target and a pointer that receives result nonces.
// NOTE: the relative order of the target and result-nonce arguments is NOT uniform across the
// "_final" declarations below (and blake512 also takes a nonce-vector pointer) -- check each
// prototype before calling.
extern void x17_sha512_cpu_hash_64_final(int thr_id, uint32_t threads, uint32_t *d_hash, uint32_t *resNonce, const uint64_t target);
extern void x11_shavite512_cpu_hash_64_sp_final(int thr_id, uint32_t threads, uint32_t *d_hash, const uint64_t target, uint32_t* resNonce);
extern void quark_groestl512_cpu_hash_64_final(int thr_id, uint32_t threads, uint32_t *d_hash, uint32_t *d_resNonce, const uint64_t target);
extern void quark_skein512_cpu_hash_64_final(int thr_id, uint32_t threads, uint32_t *d_hash, uint64_t target, uint32_t *d_resNonce);
// Routines named after two algorithms (simd+echo, cubehash+shavite); they take only the hash buffer.
extern void x16_simd_echo512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash);
extern void x11_cubehash_shavite512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash);
extern void quark_blake512_cpu_hash_64_final(int thr_id, uint32_t threads, uint32_t *d_nonceVector, uint32_t *d_outputHash, uint32_t *resNonce, const uint64_t target);
Expand Down Expand Up @@ -263,11 +264,59 @@ extern "C" int scanhash_x16r(int thr_id, struct work* work, uint32_t max_nonce,
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
const int dev_id = device_map[thr_id];
int intensity = (device_sm[dev_id] > 500 ) ? 20 : 18;
/* int intensity = (device_sm[dev_id] > 500 ) ? 20 : 18;
if (strstr(device_name[dev_id], "GTX 1080")) intensity = 21;
if (strstr(device_name[dev_id], "GTX 1060")) intensity = 19;
if (strstr(device_name[dev_id], "GTX 1050")) intensity = 18;
uint32_t throughput = cuda_default_throughput(thr_id, 1U << intensity);
*/
uint32_t default_throughput=1<<19;
bool splitsimd = true;
bool merge = false;
if ((strstr(device_name[dev_id], "1060")) || (strstr(device_name[dev_id], "P106")))
{
default_throughput = (1 << 21);
splitsimd = false;
}
else if ((strstr(device_name[dev_id], "970") || (strstr(device_name[dev_id], "980"))))
{
default_throughput = (1 << 21);
splitsimd = false;
}
else if ((strstr(device_name[dev_id], "1050")))
{
default_throughput = 1 << 20;
splitsimd = false;
}
else if ((strstr(device_name[dev_id], "950")))
{
default_throughput = 1 << 20;
splitsimd = false;
}
else if ((strstr(device_name[dev_id], "960")))
{
default_throughput = 1 << 20;
splitsimd = false;
}
else if ((strstr(device_name[dev_id], "750")))
{
default_throughput = 1 << 20;
splitsimd = false;
}
else if ((strstr(device_name[dev_id], "1070")) || (strstr(device_name[dev_id], "P104")))
{
default_throughput = (1 << 24); //53686272; //1 << 20
merge = true;
}
else if ((strstr(device_name[dev_id], "1080 Ti")) || (strstr(device_name[dev_id], "1080")) || (strstr(device_name[dev_id], "P102")))
{
default_throughput = (1 << 24); //53686272; //1 << 20
merge = true;
}
uint32_t throughput = cuda_default_throughput(thr_id, default_throughput);

if (throughput > (1<<22)) splitsimd = true;

if (init[thr_id])
{
throughput = min(throughput, max_nonce - first_nonce);
Expand All @@ -294,7 +343,14 @@ extern "C" int scanhash_x16r(int thr_id, struct work* work, uint32_t max_nonce,
quark_jh512_cpu_init(thr_id, throughput);
quark_keccak512_cpu_init(thr_id, throughput);
x11_shavite512_cpu_init(thr_id, throughput);
x11_simd512_cpu_init(thr_id, throughput); // 64
if (splitsimd)
{
x11_simd512_cpu_init(thr_id, (throughput >> 4));
}
else
{
x11_simd512_cpu_init(thr_id, (throughput));
}
x13_hamsi512_cpu_init(thr_id, throughput);
x16_echo512_cuda_init(thr_id, throughput);
x16_fugue512_cpu_init(thr_id, throughput);
Expand Down Expand Up @@ -323,10 +379,10 @@ extern "C" int scanhash_x16r(int thr_id, struct work* work, uint32_t max_nonce,
((uint32_t*)ptarget)[7] = 0x003ff;
// ((uint32_t*)pdata)[1] = 0xFEDCBA98;
// ((uint32_t*)pdata)[2] = 0x76543210;
// ((uint32_t*)pdata)[1] = 0x9A9A9A9A;
// ((uint32_t*)pdata)[2] = 0x9A9A9A9A;
((uint32_t*)pdata)[1] = 0x22222222;
((uint32_t*)pdata)[2] = 0x22222222;
((uint32_t*)pdata)[1] = 0x55555555;
((uint32_t*)pdata)[2] = 0x55555555;
// ((uint32_t*)pdata)[1] = 0x01234567;
// ((uint32_t*)pdata)[2] = 0x22222222;
// ((uint32_t*)pdata)[1] = 0x01234567;
// ((uint32_t*)pdata)[2] = 0x89ABCDEF;
//((uint8_t*)pdata)[8] = 0x90; // hashOrder[0] = '9'; for simd 80 + blake512 64
Expand Down Expand Up @@ -541,7 +597,17 @@ extern "C" int scanhash_x16r(int thr_id, struct work* work, uint32_t max_nonce,
quark_keccak512_cpu_hash_64(thr_id, throughput, NULL, d_hash[thr_id]); order++;
break;
case SKEIN:
quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
if (i == 15)
{
quark_skein512_cpu_hash_64_final(thr_id, throughput, d_hash[thr_id], ((uint64_t *)ptarget)[3], d_resNonce[thr_id]);
CUDA_SAFE_CALL(cudaMemcpy(h_resNonce[thr_id], d_resNonce[thr_id], 2 * sizeof(uint32_t), cudaMemcpyDeviceToHost));
work->nonces[0] = h_resNonce[thr_id][0];
addstart = true;
}
else
{
quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
}
break;
case LUFFA:
if (i == 15)
Expand Down Expand Up @@ -581,29 +647,74 @@ extern "C" int scanhash_x16r(int thr_id, struct work* work, uint32_t max_nonce,
}
break;
case SIMD:
if (nextalgo == ECHO)
{
x16_simd_echo512_cpu_hash_64(thr_id, throughput, d_hash[thr_id] );
i = i + 1;
}
else if (nextalgo == WHIRLPOOL)
{
x16_simd_whirlpool512_cpu_hash_64(thr_id, throughput, d_hash[thr_id]);
i = i + 1;
}
else if (nextalgo == HAMSI)
{
x16_simd_hamsi512_cpu_hash_64(thr_id, throughput, d_hash[thr_id]);
i = i + 1;
}
else if (nextalgo == FUGUE)
if (!splitsimd)
{
x16_simd_fugue512_cpu_hash_64(thr_id, throughput, d_hash[thr_id]);
i = i + 1;
if (nextalgo == ECHO)
{
x16_simd_echo512_cpu_hash_64(thr_id, throughput, d_hash[thr_id]);
i = i + 1;
}
else if (nextalgo == WHIRLPOOL)
{
x16_simd_whirlpool512_cpu_hash_64(thr_id, throughput, d_hash[thr_id]);
i = i + 1;
}
else if (nextalgo == HAMSI)
{
x16_simd_hamsi512_cpu_hash_64(thr_id, throughput, d_hash[thr_id]);
i = i + 1;
}
else if (nextalgo == FUGUE)
{
x16_simd_fugue512_cpu_hash_64(thr_id, throughput, d_hash[thr_id]);
i = i + 1;
}
else
{
x11_simd512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order);
}
}
else
{
x11_simd512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id],order);
if (nextalgo == ECHO)
{
for (int j = 0; j < 256; j += 16)
{
x16_simd_echo512_cpu_hash_64(thr_id, throughput >> 4, d_hash[thr_id] + (((throughput / 4)*j) / (sizeof(int))));
}
i = i + 1;
}
else if (nextalgo == WHIRLPOOL)
{
for (int j = 0; j < 256; j += 16)
{
x16_simd_whirlpool512_cpu_hash_64(thr_id, throughput >> 4, d_hash[thr_id] + (((throughput / 4)*j) / (sizeof(int))));
}
i = i + 1;
}
else if (nextalgo == HAMSI)
{
for (int j = 0; j < 256; j += 16)
{
x16_simd_hamsi512_cpu_hash_64(thr_id, throughput >> 4, d_hash[thr_id] + (((throughput / 4)*j) / (sizeof(int))));
}
i = i + 1;
}
else if (nextalgo == FUGUE)
{
for (int j = 0; j < 256; j += 16)
{
x16_simd_fugue512_cpu_hash_64(thr_id, throughput >> 4, d_hash[thr_id] + (((throughput / 4)*j) / (sizeof(int))));
}
i = i + 1;
}
else
{
for (int j = 0; j < 256; j += 16)
{
x11_simd512_cpu_hash_64(thr_id, throughput >> 4, pdata[19], NULL, d_hash[thr_id] + (((throughput / 4)*j) / (sizeof(int))),order);
}
}
}
break;
case ECHO:
Expand Down
Loading

0 comments on commit f462f8d

Please sign in to comment.