NeoScrypt: 7% faster on the 750 Ti, 4% faster on the 970
runestensland committed Jul 25, 2015
1 parent 79f3cd1 commit 3dd9b76
Showing 2 changed files with 111 additions and 53 deletions.
2 changes: 1 addition & 1 deletion cuda_vector.h
@@ -770,7 +770,7 @@ static __device__ __inline__ uint32_t __ldgtoint_unaligned2(const uint8_t *ptr)

#endif

static __forceinline__ __device__ void shift256R2(uint32_t * ret, const uint8 &vec4, uint32_t shift)
static __forceinline__ __device__ void shift256R2(uint32_t * ret, const uint8 &vec4, const uint32_t shift)
{
uint32_t truc = 0, truc2 = cuda_swab32(vec4.s7), truc3 = 0;
asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(truc) : "r"(truc3), "r"(truc2), "r"(shift));
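// Note (not part of the patch): the inline PTX above is a right funnel shift —
// truc receives the low 32 bits of ((truc2:truc3) >> shift), with shift clamped to 32.
// An equivalent intrinsic form would be the sketch below:
//   truc = __funnelshift_rc(truc3, truc2, shift);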
162 changes: 110 additions & 52 deletions neoscrypt/cuda_neoscrypt.cu
@@ -346,6 +346,7 @@ static __forceinline__ __host__ void Blake2Shost(uint32_t * inout, const uint32_

V.hi.s4 ^= BLAKE2S_BLOCK_SIZE;


for (int x = 0; x < 10; ++x)
{
BLAKE_Ghost(x, 0x00, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inkey);
@@ -392,56 +393,80 @@ static __forceinline__ __device__ void fastkdf256(int thread, const uint32_t * p
uchar4 bufhelper;
uint8_t A[320],B[288];

((uintx64*)A)[0] = ((uintx64*)password)[0];
((uint816 *)A)[4] = ((uint816 *)password)[0];

((uintx64*)B)[0] = ((uintx64*)password)[0];
((uint48 *)B)[8] = ((uint48 *)password)[0];
((uintx64*)A)[0] = ((uintx64*)password)[0];
((uint816 *)A)[4] = ((uint816 *)password)[0];

uint32_t input[BLAKE2S_BLOCK_SIZE/4]; uint32_t key[BLAKE2S_BLOCK_SIZE / 4]={0};
((uintx64*)B)[0] = ((uintx64*)password)[0];
((uint48 *)B)[8] = ((uint48 *)password)[0];

((uint816*)input)[0] = ((uint816*)input_init)[0];
((uint48*)key)[0] = ((uint48*)key_init)[0];
uint32_t input[BLAKE2S_BLOCK_SIZE/4]; uint32_t key[BLAKE2S_BLOCK_SIZE / 4]={0};

((uint816*)input)[0] = ((uint816*)input_init)[0];
((uint48*)key)[0] = ((uint48*)key_init)[0];

for (int i = 0; i < 32; ++i)
#pragma unroll
for (int i = 0; i < 31; ++i)
{

// Blake2Stest(thread,input, key);

bufhelper = ((uchar4*)input)[0];
#pragma unroll
for (int x = 1; x < BLAKE2S_OUT_SIZE / 4; ++x) { bufhelper += ((uchar4*)input)[x]; }
bufidx = bufhelper.x + bufhelper.y + bufhelper.z + bufhelper.w;

int qbuf = bufidx/4;
int qbuf = bufidx >> 2;
int rbuf = bufidx&3;
int bitbuf = rbuf << 3;
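// bufidx is a byte offset into the buffer: qbuf is the 32-bit word index (bufidx / 4),
// rbuf the byte within that word, and bitbuf the matching bit offset (rbuf * 8)
// passed to shift256R2 below.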
uint32_t shifted[9];

shift256R2(shifted, ((uint8*)input)[0], bitbuf);

for (int k = 0; k < 9; ++k) {
#pragma unroll
for (int k = 0; k < 9; ++k)
{
((uint32_t *)B)[k + qbuf] ^= ((uint32_t *)shifted)[k];
}

if (bufidx < BLAKE2S_KEY_SIZE) {((uint8*)B)[8] = ((uint8*)B)[0];}
else if (bufidx > FASTKDF_BUFFER_SIZE-BLAKE2S_OUT_SIZE) {((uint8*)B)[0] = ((uint8*)B)[8];}
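// B keeps a 32-byte mirror of its head at the tail; when the XOR window touches
// either end, the mirror is refreshed so wrapped reads stay consistent.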

if (i<31) {
for (int k = 0; k <BLAKE2S_BLOCK_SIZE / 4; k++) {
#pragma unroll
for (int k = 0; k <BLAKE2S_BLOCK_SIZE / 4; k++)
{
((uchar4*)(input))[k] = make_uchar4((A + bufidx)[4 * k], (A + bufidx)[4 * k + 1],
(A + bufidx)[4 * k + 2], (A + bufidx)[4 * k + 3]);
}

for (int k = 0; k <BLAKE2S_KEY_SIZE / 4; k++) {
#pragma unroll
for (int k = 0; k <BLAKE2S_KEY_SIZE / 4; k++)
{
((uchar4*)(key))[k] = make_uchar4((B + bufidx)[4 * k], (B + bufidx)[4 * k + 1],
(B + bufidx)[4 * k + 2], (B + bufidx)[4 * k + 3]);
(B + bufidx)[4 * k + 2], (B + bufidx)[4 * k + 3]);
}
Blake2S((uint32_t*)input, key);
}

}
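// Last mixing round, peeled out of the loop: no input/key rebuild or further
// Blake2S call is needed here, so the 31-iteration loop above can carry
// #pragma unroll without the old per-iteration i < 31 branch.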
bufhelper = ((uchar4*)input)[0];
#pragma unroll
for (int x = 1; x < BLAKE2S_OUT_SIZE / 4; ++x) { bufhelper += ((uchar4*)input)[x]; }
bufidx = bufhelper.x + bufhelper.y + bufhelper.z + bufhelper.w;

int qbuf = bufidx >> 2;
int rbuf = bufidx & 3;
int bitbuf = rbuf << 3;
uint32_t shifted[9];

shift256R2(shifted, ((uint8*)input)[0], bitbuf);

#pragma unroll
for (int k = 0; k < 9; ++k) {
((uint32_t *)B)[k + qbuf] ^= ((uint32_t *)shifted)[k];
}

if (bufidx < BLAKE2S_KEY_SIZE) { ((uint8*)B)[8] = ((uint8*)B)[0]; }
else if (bufidx > FASTKDF_BUFFER_SIZE - BLAKE2S_OUT_SIZE) { ((uint8*)B)[0] = ((uint8*)B)[8]; }


int left = FASTKDF_BUFFER_SIZE - bufidx;
int qleft =left/4;
int qleft = left >> 2;
int rleft =left&3;
for (int k = 0; k < qleft; ++k) { ((uchar4*)output)[k] =
make_uchar4((B + bufidx)[4 * k], (B + bufidx)[4 * k + 1],
@@ -470,17 +495,19 @@ static __forceinline__ __device__ void fastkdf32( const uint32_t * password, con
((uint816*)A)[4] = ((uint816*)password)[0];
((uintx64*)B)[0] = ((uintx64*)salt)[0];
((uintx64*)B)[1] = ((uintx64*)salt)[0];
uint32_t input[BLAKE2S_BLOCK_SIZE/4]; uint32_t key[BLAKE2S_BLOCK_SIZE/4]={0};
((uint816*)input)[0] = ((uint816*)password)[0];
((uint48*)key)[0] = ((uint48*)salt)[0];
uint32_t input[BLAKE2S_BLOCK_SIZE/4]; uint32_t key[BLAKE2S_BLOCK_SIZE/4]={0};
((uint816*)input)[0] = ((uint816*)password)[0];
((uint48*)key)[0] = ((uint48*)salt)[0];

for (int i = 0; i < 32; ++i)
#pragma unroll
for (int i = 0; i < 31; ++i)
{

Blake2S((uint32_t*)input, key);

bufidx = 0;
bufhelper = ((uchar4*)input)[0];
#pragma unroll
for (int x = 1; x < BLAKE2S_OUT_SIZE / 4; ++x) { bufhelper += ((uchar4*)input)[x]; }
bufidx = bufhelper.x + bufhelper.y + bufhelper.z + bufhelper.w;
int qbuf = bufidx / 4;
@@ -490,26 +517,47 @@ uint32_t input[BLAKE2S_BLOCK_SIZE/4]; uint32_t key[BLAKE2S_BLOCK_SIZE/4]={0};

shift256R2(shifted, ((uint8*)input)[0], bitbuf);

for (int k = 0; k < 9; ++k) {
#pragma unroll
for (int k = 0; k < 9; ++k)
{
((uint32_t *)B)[k + qbuf] ^= ((uint32_t *)shifted)[k];
}

if (i<31){
if (bufidx < BLAKE2S_KEY_SIZE) {((uint8*)B)[8] = ((uint8*)B)[0];}
else if (bufidx > FASTKDF_BUFFER_SIZE - BLAKE2S_OUT_SIZE) {((uint8*)B)[0] = ((uint8*)B)[8];}
// MyUnion Test;

for (uint8_t k = 0; k <BLAKE2S_BLOCK_SIZE/4 ; k++) {
((uchar4*)(input))[k] =
make_uchar4((A + bufidx)[4 * k], (A + bufidx)[4 * k + 1], (A + bufidx)[4 * k + 2], (A + bufidx)[4 * k + 3]);
}
for (uint8_t k = 0; k <BLAKE2S_KEY_SIZE / 4; k++) {
((uchar4*)(key))[k] =
make_uchar4((B + bufidx)[4 * k], (B + bufidx)[4 * k + 1], (B + bufidx)[4 * k + 2], (B + bufidx)[4 * k + 3]);
#pragma unroll
for (uint8_t k = 0; k <BLAKE2S_BLOCK_SIZE / 4; k++)
{
((uchar4*)(input))[k] =
make_uchar4((A + bufidx)[4 * k], (A + bufidx)[4 * k + 1], (A + bufidx)[4 * k + 2], (A + bufidx)[4 * k + 3]);
}
#pragma unroll
for (uint8_t k = 0; k <BLAKE2S_KEY_SIZE / 4; k++)
{
((uchar4*)(key))[k] =
make_uchar4((B + bufidx)[4 * k], (B + bufidx)[4 * k + 1], (B + bufidx)[4 * k + 2], (B + bufidx)[4 * k + 3]);
}
}
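// Final round peeled out of the loop, mirroring the fastkdf256 change above:
// one more Blake2S and buffer mix without the input/key rebuild.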

Blake2S((uint32_t*)input, key);

bufidx = 0;
bufhelper = ((uchar4*)input)[0];
#pragma unroll
for (int x = 1; x < BLAKE2S_OUT_SIZE / 4; ++x) { bufhelper += ((uchar4*)input)[x]; }
bufidx = bufhelper.x + bufhelper.y + bufhelper.z + bufhelper.w;
int qbuf = bufidx / 4;
int rbuf = bufidx & 3;
int bitbuf = rbuf << 3;
uint32_t shifted[9];

shift256R2(shifted, ((uint8*)input)[0], bitbuf);

#pragma unroll
for (int k = 0; k < 9; ++k)
{
((uint32_t *)B)[k + qbuf] ^= ((uint32_t *)shifted)[k];
}


uchar4 unfucked[1];
unfucked[0] = make_uchar4(B[28 + bufidx], B[29 + bufidx],B[30 + bufidx], B[31 + bufidx]);
@@ -569,8 +617,7 @@ c += d; b = rotate(b^c, 7); \




static __forceinline__ __device__ uint16 salsa_small_scalar_rnd(const uint16 &X)
__forceinline__ __device__ uint16 salsa_small_scalar_rnd(const uint16 &X)
{
uint16 state = X;
uint32_t t;
@@ -580,7 +627,7 @@ static __forceinline__ __device__ uint16 salsa_small_scalar_rnd(const uint16 &X)
return(X + state);
}

static __device__ __forceinline__ uint16 chacha_small_parallel_rnd(const uint16 &X)
__device__ __forceinline__ uint16 chacha_small_parallel_rnd(const uint16 &X)
{

uint16 st = X;
@@ -621,18 +668,25 @@ static __device__ __forceinline__ void neoscrypt_salsa(uint16 *XV)
#define SHIFT 130


__global__ __launch_bounds__(128, 1) void neoscrypt_gpu_hash_k0(int stratum, int threads, uint32_t startNonce)
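// Occupancy hint now depends on architecture: sm_52 parts such as the GTX 970
// request at least 2 resident blocks per SM, while sm_50 parts such as the
// 750 Ti request 3.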
__global__
#if __CUDA_ARCH__ > 500
__launch_bounds__(128, 2)
#else
__launch_bounds__(128, 3)
#endif
void neoscrypt_gpu_hash_k0(int stratum, int threads, uint32_t startNonce)
{

int thread = (blockDim.x * blockIdx.x + threadIdx.x);
int shift = SHIFT * 16 * thread;
// if (thread < threads)
if (thread < threads)
{
const uint32_t nonce = startNonce + thread;

uint16 X[4];
uint32_t data[80];

#pragma unroll
for (int i = 0; i<20; i++) { ((uint4*)data)[i] = ((uint4 *)c_data)[i]; } //ld.local.v4
data[19] = (stratum) ? cuda_swab32(nonce) : nonce; //freaking morons !!!
data[39] = data[19];
@@ -645,19 +699,19 @@ __global__ __launch_bounds__(128, 1) void neoscrypt_gpu_hash_k0(int stratum, int
}
}

__global__ __launch_bounds__(128, 1) void neoscrypt_gpu_hash_k01(int threads, uint32_t startNonce)
__global__ __launch_bounds__(128, 2) void neoscrypt_gpu_hash_k01(int threads, uint32_t startNonce)
{

int thread = (blockDim.x * blockIdx.x + threadIdx.x);
int shift = SHIFT * 16 * thread;
// if (thread < threads)
if (thread < threads)
{


uint16 X[4];
((uintx64 *)X)[0]= __ldg32(&(W + shift)[0]);

//#pragma unroll
#pragma unroll
for (int i = 0; i < 128; ++i)
{
neoscrypt_chacha(X);
@@ -670,16 +724,17 @@ __global__ __launch_bounds__(128, 1) void neoscrypt_gpu_hash_k01(int threads, ui
}
}

__global__ __launch_bounds__(128, 1) void neoscrypt_gpu_hash_k2(int threads, uint32_t startNonce)
__global__ __launch_bounds__(128, 2) void neoscrypt_gpu_hash_k2(int threads, uint32_t startNonce)
{

int thread = (blockDim.x * blockIdx.x + threadIdx.x);
int shift = SHIFT * 16 * thread;
// if (thread < threads)
if (thread < threads)
{
uint16 X[4];
((uintx64 *)X)[0] = __ldg32(&(W + shift)[2048]);

#pragma unroll
for (int t = 0; t < 128; t++)
{
int idx = X[3].lo.s0 & 0x7F;
@@ -692,10 +747,10 @@ __global__ __launch_bounds__(128, 1) void neoscrypt_gpu_hash_k2(int threads, uin
}
}

__global__ __launch_bounds__(128, 1) void neoscrypt_gpu_hash_k3(int threads, uint32_t startNonce)
__global__ __launch_bounds__(128, 2) void neoscrypt_gpu_hash_k3(int threads, uint32_t startNonce)
{
int thread = (blockDim.x * blockIdx.x + threadIdx.x);
// if (thread < threads)
if (thread < threads)
{

int shift = SHIFT * 16 * thread;
@@ -704,7 +759,7 @@ __global__ __launch_bounds__(128, 1) void neoscrypt_gpu_hash_k3(int threads, uin

((uintx64*)Z)[0] = __ldg32(&(W + shift)[0]);

//#pragma unroll
#pragma unroll
for (int i = 0; i < 128; ++i)
{
neoscrypt_salsa(Z);
@@ -716,11 +771,12 @@ __global__ __launch_bounds__(128, 1) void neoscrypt_gpu_hash_k4(int stratum,int
}
}

__global__ __launch_bounds__(128, 1) void neoscrypt_gpu_hash_k4(int stratum,int threads, uint32_t startNonce, uint32_t *nonceVector)
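// The final kernel drops to 32-thread blocks with a hint of 12 resident blocks
// per SM, consistent with the host-side threadsperblock change from 128 to 32
// further down.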

__global__ __launch_bounds__(32, 12) void neoscrypt_gpu_hash_k4(int stratum,int threads, uint32_t startNonce, uint32_t *nonceVector)
{

int thread = (blockDim.x * blockIdx.x + threadIdx.x);
// if (thread < threads)
if (thread < threads)
{
const uint32_t nonce = startNonce + thread;

@@ -730,11 +786,13 @@ __global__ __launch_bounds__(128, 1) void neoscrypt_gpu_hash_k4(int stratum,int

uint32_t data[80];

#pragma unroll
for (int i = 0; i<20; i++) { ((uint4*)data)[i] = ((uint4 *)c_data)[i]; }
data[19] = (stratum) ? cuda_swab32(nonce) : nonce;
data[39] = data[19];
data[59] = data[19];
((uintx64 *)Z)[0] = __ldg32(&(W + shift)[2048]);
#pragma unroll
for (int t = 0; t < 128; t++)
{
int idx = Z[3].lo.s0 & 0x7F;
@@ -765,7 +823,7 @@ __host__ uint32_t neoscrypt_cpu_hash_k4(int stratum,int thr_id, int threads, uin
cudaMemset(d_NNonce[thr_id], 0xffffffff, sizeof(uint32_t));


const int threadsperblock = 128;
const int threadsperblock = 32;


dim3 grid((threads + threadsperblock - 1) / threadsperblock);
