Skip to content

Commit

Permalink
CUDA and Linux optimizations
Browse files Browse the repository at this point in the history
* Major optimization of the CUDA miner (+32% on GTX 1070 8GB, +37% on GTX 1060 3GB, +40% on GTX 950)
* Simple optimization on Linux (up to +5%, depending on the CPU)
  • Loading branch information
polyminer1 committed Dec 14, 2018
1 parent 9d434ea commit e189c09
Show file tree
Hide file tree
Showing 27 changed files with 252 additions and 135 deletions.
2 changes: 1 addition & 1 deletion BuildInfo.h
@@ -1,5 +1,5 @@
#pragma once

#define RH_PROJECT_NAME "rhminer"
#define RH_PROJECT_VERSION "1.1.1"
#define RH_PROJECT_VERSION "1.2"

4 changes: 4 additions & 0 deletions CHANGELOG.md
@@ -1,5 +1,9 @@
# rhminer update and bugfix history

Version 1.2 - 13 dec 2018
* Major optimization of the CUDA miner (+32% on GTX 1070 8GB, +37% on GTX 1060 3GB, +40% on GTX 950)
* Simple optimization on Linux (up to +5%, depending on the CPU)

Version 1.1.1 - 6 dec 2018
* Critical Fix for memory corruption on Linux
* Fixed cross server shares submission
Expand Down
2 changes: 1 addition & 1 deletion CMakeLists.txt
Expand Up @@ -43,7 +43,7 @@ if(RH_DEBUG_TARGET)
set(CMAKE_BUILD_TYPE Debug)
else(RH_DEBUG_TARGET)
message(STATUS "Building ${PROJECT_NAME} for ${RH_CUDA_ARCH} architecture in Release")
add_definitions( -Wall -O2 -fexceptions -Wfatal-errors -Wno-sign-compare -Wno-deprecated-declarations -Wno-unused-variable)
add_definitions( -Wall -Ofast -fexceptions -Wfatal-errors -Wno-sign-compare -Wno-deprecated-declarations -Wno-unused-variable)
set(CMAKE_BUILD_TYPE Release)
add_definitions(-DNDEBUG)
endif(RH_DEBUG_TARGET)
Expand Down
2 changes: 0 additions & 2 deletions MinersLib/Algo/blake2s.cpp
Expand Up @@ -169,7 +169,6 @@ int blake2s_init( blake2s_state *S, const uint8_t outlen )
store48( &P->node_offset, 0 );
P->node_depth = 0;
P->inner_length = 0;
// memset(P->reserved, 0, sizeof(P->reserved) );
memset( P->salt, 0, sizeof( P->salt ) );
memset( P->personal, 0, sizeof( P->personal ) );
return blake2s_init_param( S, P );
Expand All @@ -191,7 +190,6 @@ int blake2s_init_key( blake2s_state *S, const uint8_t outlen, const void *key, c
store48( &P->node_offset, 0 );
P->node_depth = 0;
P->inner_length = 0;
// memset(P->reserved, 0, sizeof(P->reserved) );
memset( P->salt, 0, sizeof( P->salt ) );
memset( P->personal, 0, sizeof( P->personal ) );

Expand Down
1 change: 1 addition & 0 deletions MinersLib/Algo/blake2s.h
Expand Up @@ -124,6 +124,7 @@ extern "C" {
uint8_t buf[2 * BLAKE2S_BLOCKBYTES];
size_t buflen;
uint8_t last_node;
uint8_t _pading[7];
} blake2s_state ;
#pragma pack(pop)

Expand Down
2 changes: 1 addition & 1 deletion MinersLib/GenericMinerClient.cpp
Expand Up @@ -213,7 +213,7 @@ void GenericMinerClient::doStratum()
{
if (m_stratumClient->IsWorkTimedOut() /*&& m_stratumClient->isConnected()*/)
{
PrintOut("WorkTimeout reacched. No new work received after %u seconds.\n", g_workTimeout);
PrintOut("WorkTimeout reached. No new work received after %u seconds.\n", g_workTimeout);
if (m_stratumClient.get())
m_stratumClient->CloseConnection();
else
Expand Down
5 changes: 3 additions & 2 deletions MinersLib/Global.cpp
Expand Up @@ -97,7 +97,6 @@ GlobalMiningPreset::GlobalMiningPreset()
}
});
#endif //RH_COMPILE_CPU_ONLY

CmdLineManager::GlobalOptions().RegisterValue("devfee", "General", "Set devfee raward percentage. To disable devfee, simply put 0 here. But, before disabling developer fees, consider that it takes time and energy to maintain, develop and optimize this software. Your help is very appreciated.", [&](const string& val)
{
if (val == "0" || val == "0.0")
Expand All @@ -121,7 +120,6 @@ GlobalMiningPreset::GlobalMiningPreset()
m_devfeePercent = 1.0f;
}
});

CmdLineManager::GlobalOptions().RegisterFlag("list", "General", "List all gpu in the system", [&]()
{
GpuManager::listGPU();
Expand Down Expand Up @@ -262,6 +260,8 @@ bool GlobalMiningPreset::UpdateToDevModeState(string& connectionParams)
if (TimeGetMilliSec() > m_devFeeTimer24hMS)
{
U64 nowMS = TimeGetMilliSec();
rand32_reseed((U32)(nowMS));

m_devFeeTimer24hMS = nowMS + t24H;
m_nextDevFeeTimesMS.clear();
m_totalDevFreeTimeToDayMS = 0;
Expand Down Expand Up @@ -437,5 +437,6 @@ void GlobalMiningPreset::DoPerformanceTest()
for (auto h : hashes)
hashCnt += h;
PrintOut("RandomHash speed is %.2f H/S \n", hashCnt / (float)g_testPerformance);

exit(0);
}
4 changes: 2 additions & 2 deletions MinersLib/Pascal/RandomHashCPUMiner.cpp
Expand Up @@ -115,10 +115,10 @@ void RandomHashCPUMiner::RandomHashCpuKernel(CPUKernelData* kernelData)
//set start nonce here
RandomHash_Search(&m_randomHashArray[kernelData->m_id], (U8*)packageData->m_work1, gid);
#endif
if (RH_swap_u32(*(U32*)packageData->m_work1) <= packageData->m_target)
U32* work = (uint32_t *)packageData->m_work1;
if (RH_swap_u32(*work) <= packageData->m_target)
{
//Swapb256
U32 *work = (uint32_t *)packageData->m_work1;
U32 tmp[4] = {work[0], work[1], work[2], work[3]};
work[0] = RH_swap_u32(work[7]);
work[1] = RH_swap_u32(work[6]);
Expand Down
13 changes: 10 additions & 3 deletions MinersLib/Pascal/RandomHash_Blake2s.h
Expand Up @@ -200,10 +200,17 @@ void CUDA_SYM_DECL(RandomHash_blake2s)(RH_StridePtr roundInput, RH_StridePtr out
P->node_offset[5] = 0;
P->node_depth = 0;
P->inner_length = 0;
memset( P->salt, 0, sizeof( P->salt ) );
memset( P->personal, 0, sizeof( P->personal ) );

memset(&S, 0, sizeof( blake2s_state ) );
#if defined(_WIN32_WINNT) || defined(__CUDA_ARCH__)
RH_memzero_8(P->salt, sizeof( P->salt ))
RH_memzero_8(P->personal, sizeof( P->personal ) );
#else
memset(P->salt, 0, sizeof( P->salt ));
memset(P->personal, 0, sizeof( P->personal ) );
#endif

RH_memzero_of16(&S, sizeof( blake2s_state ) );

for( int i = 0; i < 8; ++i ) S.h[i] = blake2s_IV[i];

uint32_t *p = ( uint32_t * )( P );
Expand Down
10 changes: 6 additions & 4 deletions MinersLib/Pascal/RandomHash_Grindahl512.h
Expand Up @@ -436,8 +436,9 @@ void CUDA_SYM_DECL(RandomHash_Grindahl512)(RH_StridePtr roundInput, RH_StridePtr
//init
RH_ALIGN(64) uint64_t stateBuff[Grindalh_WorkSize];
RH_ALIGN(64) uint64_t tempBuff[Grindalh_WorkSize];
memset(stateBuff, 0, Grindalh_WorkSize * sizeof(uint64_t));
memset(tempBuff, 0, Grindalh_WorkSize * sizeof(uint64_t));
RH_memzero_of8(stateBuff, sizeof(stateBuff));
tempBuff[0] = 0;

uint64_t* state = stateBuff;
uint64_t* temp = tempBuff;

Expand All @@ -457,8 +458,9 @@ void CUDA_SYM_DECL(RandomHash_Grindahl512)(RH_StridePtr roundInput, RH_StridePtr
int32_t padding_size = 16 - int32_t(msgLen & 7);
uint64_t msg_length = (msgLen >> 3) + 1;

U8 pad[16];
memset(pad, 0, sizeof(pad));

RH_ALIGN(64) U8 pad[16];
RH_memzero_16(pad, sizeof(pad));
pad[0] = (U8)0x80;

msg_length = RH_swap_u64(msg_length);
Expand Down
4 changes: 2 additions & 2 deletions MinersLib/Pascal/RandomHash_Haval_5_256.h
Expand Up @@ -275,12 +275,12 @@
HAVAL_STEP(n, 5, s1, s0, s7, s6, s5, s4, s3, s2, HAVAL_INW(25), (0xC1A94FB6)); \
HAVAL_STEP(n, 5, s0, s7, s6, s5, s4, s3, s2, s1, HAVAL_INW(15), (0x409F60C4)); \

typedef struct {
struct RH_ALIGN(64) RH_sph_haval_context {
unsigned char buf[128]; /* first field, for alignment */
uint32_t s0, s1, s2, s3, s4, s5, s6, s7;
unsigned passes;
uint64_t count;
} RH_sph_haval_context;
};


void CUDA_SYM_DECL(RandomHash_Haval_5_256)(RH_StridePtr roundInput, RH_StridePtr output)
Expand Down
2 changes: 1 addition & 1 deletion MinersLib/Pascal/RandomHash_MurMur3_32.h
Expand Up @@ -376,7 +376,7 @@ inline void CUDA_SYM_DECL(MurmurHash3_x86_32_Update_16)(__m128i chunk128, uint32
back_h1 = h1; \
}
#endif

void CUDA_SYM_DECL(MurmurHash3_x86_32_Update)( const uint8_t* data, int len, MurmurHash3_x86_32_State* state)
{
RH_ASSERT(state->idx != 0xDEADBEEF)
Expand Down
14 changes: 6 additions & 8 deletions MinersLib/Pascal/RandomHash_RadioGatun32.h
Expand Up @@ -90,17 +90,15 @@ inline void CUDA_SYM_DECL(RadiogatunRoundFunction)(uint32_t* a, uint32_t* mill,

void CUDA_SYM_DECL(RandomHash_RadioGatun32)(RH_StridePtr roundInput, RH_StridePtr output)
{
RH_ALIGN(64) uint32_t mill[19];
uint32_t a[19];
RH_ALIGN(64) uint32_t belt[13 * 3];
RH_ALIGN(64) uint32_t mill[/*19*/20];
RH_ALIGN(64) uint32_t a[19];
RH_ALIGN(64) uint32_t belt[/*13 * 3*/40];
int32_t len = (int32_t)RH_STRIDE_GET_SIZE(roundInput);
uint32_t *inData = (uint32_t *)RH_STRIDE_GET_DATA(roundInput);
uint32_t blockCount = len / RADIOGATUN32_BLOCK_SIZE;
//init
memset(mill, 0, sizeof(mill));

for (uint32_t i = 0; i < 13; i++)
memset(&belt[i * 3], 0, 3 * sizeof(uint32_t));
RH_memzero_of16(mill, sizeof(mill));
RH_memzero_of16(belt, sizeof(belt));

//finish 0 (pre)
uint32_t pre = len % RADIOGATUN32_BLOCK_SIZE;
Expand All @@ -113,7 +111,7 @@ void CUDA_SYM_DECL(RandomHash_RadioGatun32)(RH_StridePtr roundInput, RH_StridePt
while (blockCount > 0)
{
RH_ALIGN(64) uint32_t data[RADIOGATUN32_BLOCK_SIZE];
memcpy(data, inData, 12);
memcpy(data, inData, RADIOGATUN32_BLOCK_SIZE);
uint32_t i = 0;
while (i < 3)
{
Expand Down
5 changes: 3 additions & 2 deletions MinersLib/Pascal/RandomHash_SHA2_256.h
Expand Up @@ -530,14 +530,15 @@ void CUDA_SYM_DECL(RandomHash_SHA2_256)(RH_StridePtr roundInput, RH_StridePtr ou
}
{
int32_t padindex;
RH_ALIGN(64) uint8_t pad[72];
RH_ALIGN(64) uint8_t pad[/*72*/80];

if (len < 56)
padindex = 56 - len;
else
padindex = 120 - len;

PLATFORM_MEMSET(pad, 0, sizeof(pad));
RH_memzero_of16(pad, sizeof(pad));

pad[0] = 0x80;
bits = ReverseBytesUInt64(bits);
ReadUInt64AsBytesLE(bits, pad+padindex);
Expand Down
4 changes: 2 additions & 2 deletions MinersLib/Pascal/RandomHash_SHA2_512.h
Expand Up @@ -694,7 +694,7 @@ inline void CUDA_SYM_DECL(_RandomHash_SHA2_512)(RH_StridePtr roundInput, RH_Stri
//finish
register uint64_t lowBits, hiBits;
register int32_t padindex;
RH_ALIGN(64) uint8_t pad[255];
RH_ALIGN(64) uint8_t pad[/*255*/256];

lowBits = oriLen << 3;
hiBits = oriLen >> 61;
Expand All @@ -705,7 +705,7 @@ inline void CUDA_SYM_DECL(_RandomHash_SHA2_512)(RH_StridePtr roundInput, RH_Stri
padindex = 239 - (uint32_t)len;

padindex++;
memset(pad, 0, sizeof(pad));
RH_memzero_of16(pad, sizeof(pad));
pad[0] = 0x80;

hiBits = ReverseBytesUInt64(hiBits);
Expand Down
4 changes: 2 additions & 2 deletions MinersLib/Pascal/RandomHash_SHA3_512.h
Expand Up @@ -2620,8 +2620,8 @@ void CUDA_SYM_DECL(_RandomHash_SHA3_512)(RH_StridePtr roundInput, RH_StridePtr o
const uint64_t BlockSize = 200 - (int32_t(hashsize) * 2);

//init
RH_ALIGN(64) uint64_t state[25];
memset(state, 0, 25 * sizeof(uint64_t));
RH_ALIGN(64) uint64_t state[/*25*/26];
RH_memzero_of16(state, sizeof(state));

state[1] = uint64_t(-1);
state[2] = uint64_t(-1);
Expand Down
4 changes: 2 additions & 2 deletions MinersLib/Pascal/RandomHash_Snefru_8_256.h
Expand Up @@ -703,7 +703,7 @@ void CUDA_SYM_DECL(RandomHash_Snefru_8_256)(RH_StridePtr roundInput, RH_StridePt
{
// init
RH_ALIGN(64) uint32_t state[SNEFRU_size];
memset(state, 0, SNEFRU_size * sizeof(uint32_t));
RH_memzero_32(state, sizeof(state));

int32_t len = (int32_t)RH_STRIDE_GET_SIZE(roundInput);
uint64_t bits = len * 8;
Expand All @@ -726,7 +726,7 @@ void CUDA_SYM_DECL(RandomHash_Snefru_8_256)(RH_StridePtr roundInput, RH_StridePt
padindex = SNEFRU_BlockSize - pos - 8;

RH_ALIGN(64) uint32_t pad[SNEFRU_BlockSize*2];
memset(pad, 0, sizeof(pad));
RH_memzero_of16(pad, sizeof(pad));

bits = ReverseBytesUInt64(bits);

Expand Down
3 changes: 2 additions & 1 deletion MinersLib/Pascal/RandomHash_Tiger2_5_192.h
Expand Up @@ -810,7 +810,8 @@ void CUDA_SYM_DECL(RandomHash_Tiger2_5_192)(RH_StridePtr roundInput, RH_StridePt
else
padindex = 120 - len;

memset(pad, 0, sizeof(pad));
RH_memzero_of8(pad, sizeof(pad));

pad[0] = 0x80;
ReadUInt64AsBytesLE(bits, pad + padindex);

Expand Down
5 changes: 3 additions & 2 deletions MinersLib/Pascal/RandomHash_Whirlpool.h
Expand Up @@ -403,7 +403,7 @@ void CUDA_SYM_DECL(RandomHash_WhirlPool)(RH_StridePtr roundInput, RH_StridePtr o
const uint32_t Whirlpool_BlockSize = 64;
const uint32_t Whirlpool_HashSize = 64;
RH_ALIGN(64) uint64_t state[8];
memset(state, 0, sizeof(state));
RH_memzero_64(state, sizeof(state));

//body
int32_t len = (int32_t)RH_STRIDE_GET_SIZE(roundInput);
Expand All @@ -428,7 +428,8 @@ void CUDA_SYM_DECL(RandomHash_WhirlPool)(RH_StridePtr roundInput, RH_StridePtr o
else
padindex = 56 - len;

memset(pad, 0, sizeof(pad));
RH_memzero_of16(pad, sizeof(pad));

pad[0] = 0x80;
bits = ReverseBytesUInt64(bits);
ReadUInt64AsBytesLE(bits, pad+padindex);
Expand Down

0 comments on commit e189c09

Please sign in to comment.