Skip to content

Commit

Permalink
bytes: improve performance for bytes.Compare on ppc64x
Browse files Browse the repository at this point in the history
This improves the performance for byte.Compare by rewriting
the cmpbody function in runtime/asm_ppc64x.s.  The previous code
had a simple loop which loaded a pair of bytes and compared them,
which is inefficient for long buffers.  The updated function checks
for 8 or 32 byte chunks and then loads and compares double words where
possible.

Because the byte.Compare result indicates greater or less than,
the doubleword loads must take endianness into account, using a
byte reversed load in the little endian case.

Fixes golang#17433

benchmark                                   old ns/op     new ns/op     delta
BenchmarkBytesCompare/8-16                  13.6          7.16          -47.35%
BenchmarkBytesCompare/16-16                 25.7          7.83          -69.53%
BenchmarkBytesCompare/32-16                 38.1          7.78          -79.58%
BenchmarkBytesCompare/64-16                 63.0          10.6          -83.17%
BenchmarkBytesCompare/128-16                112           13.0          -88.39%
BenchmarkBytesCompare/256-16                211           28.1          -86.68%
BenchmarkBytesCompare/512-16                410           38.6          -90.59%
BenchmarkBytesCompare/1024-16               807           60.2          -92.54%
BenchmarkBytesCompare/2048-16               1601          103           -93.57%

Change-Id: I121acc74fcd27c430797647b8d682eb0607c63eb
Reviewed-on: https://go-review.googlesource.com/30949
Reviewed-by: David Chase <drchase@google.com>
  • Loading branch information
laboger authored and mundaym committed Oct 17, 2016
1 parent 7c46f03 commit 1e28dce
Show file tree
Hide file tree
Showing 4 changed files with 225 additions and 48 deletions.
1 change: 1 addition & 0 deletions src/cmd/internal/obj/ppc64/a.out.go
Expand Up @@ -424,6 +424,7 @@ const (
ALSW
ALWAR
ALWSYNC
AMOVDBR
AMOVWBR
AMOVB
AMOVBU
Expand Down
1 change: 1 addition & 0 deletions src/cmd/internal/obj/ppc64/anames.go
Expand Up @@ -125,6 +125,7 @@ var Anames = []string{
"LSW",
"LWAR",
"LWSYNC",
"MOVDBR",
"MOVWBR",
"MOVB",
"MOVBU",
Expand Down
3 changes: 3 additions & 0 deletions src/cmd/internal/obj/ppc64/asm9.go
Expand Up @@ -1513,6 +1513,7 @@ func buildop(ctxt *obj.Link) {

case AMOVHBR:
opset(AMOVWBR, r0)
opset(AMOVDBR, r0)

case ASLBMFEE:
opset(ASLBMFEV, r0)
Expand Down Expand Up @@ -3923,6 +3924,8 @@ func oploadx(ctxt *obj.Link, a obj.As) uint32 {
return OPVCC(31, 790, 0, 0) /* lhbrx */
case AMOVWBR:
return OPVCC(31, 534, 0, 0) /* lwbrx */
case AMOVDBR:
return OPVCC(31, 532, 0, 0) /* ldbrx */
case AMOVHZ:
return OPVCC(31, 279, 0, 0) /* lhzx */
case AMOVHZU:
Expand Down
268 changes: 220 additions & 48 deletions src/runtime/asm_ppc64x.s
Expand Up @@ -812,13 +812,220 @@ eq:
MOVB R3, ret+16(FP)
RET

// Do an efficieint memequal for ppc64
// for reuse where possible.
// Do an efficient memcmp for ppc64le
// R3 = s1 len
// R4 = s2 len
// R5 = s1 addr
// R6 = s2 addr
// R7 = addr of return value
TEXT cmpbodyLE<>(SB),NOSPLIT|NOFRAME,$0-0
MOVD R3,R8 // set up length
CMP R3,R4,CR2 // unequal?
BC 12,8,setuplen // BLT CR2
MOVD R4,R8 // use R4 for comparison len
setuplen:
MOVD R8,CTR // set up loop counter
CMP R8,$8 // only optimize >=8
BLT simplecheck
DCBT (R5) // cache hint
DCBT (R6)
CMP R8,$32 // optimize >= 32
MOVD R8,R9
BLT setup8a // 8 byte moves only
setup32a:
SRADCC $5,R8,R9 // number of 32 byte chunks
MOVD R9,CTR

// Special processing for 32 bytes or longer.
// Loading this way is faster and correct as long as the
// doublewords being compared are equal. Once they
// are found unequal, reload them in proper byte order
// to determine greater or less than.
loop32a:
MOVD 0(R5),R9 // doublewords to compare
MOVD 0(R6),R10 // get 4 doublewords
MOVD 8(R5),R14
MOVD 8(R6),R15
CMPU R9,R10 // bytes equal?
MOVD $0,R16 // set up for cmpne
BNE cmpne // further compare for LT or GT
MOVD 16(R5),R9 // get next pair of doublewords
MOVD 16(R6),R10
CMPU R4,R15 // bytes match?
MOVD $8,R16 // set up for cmpne
BNE cmpne // further compare for LT or GT
MOVD 24(R5),R14 // get next pair of doublewords
MOVD 24(R6),R15
CMPU R9,R10 // bytes match?
MOVD $16,R16 // set up for cmpne
BNE cmpne // further compare for LT or GT
MOVD $-8,R16 // for cmpne, R5,R6 already inc by 32
ADD $32,R5 // bump up to next 32
ADD $32,R6
CMPU R14,R15 // bytes match?
BC 8,2,loop32a // br ctr and cr
BNE cmpne
ANDCC $24,R8,R9 // Any 8 byte chunks?
BEQ leftover // and result is 0
setup8a:
SRADCC $3,R9,R9 // get the 8 byte count
BEQ leftover // shifted value is 0
MOVD R9,CTR // loop count for doublewords
loop8:
MOVDBR (R5+R0),R9 // doublewords to compare
MOVDBR (R6+R0),R10 // LE compare order
ADD $8,R5
ADD $8,R6
CMPU R9,R10 // match?
BC 8,2,loop8 // bt ctr <> 0 && cr
BGT greater
BLT less
leftover:
ANDCC $7,R8,R9 // check for leftover bytes
MOVD R9,CTR // save the ctr
BNE simple // leftover bytes
BC 12,10,equal // test CR2 for length comparison
BC 12,8,less
BR greater
simplecheck:
CMP R8,$0 // remaining compare length 0
BNE simple // do simple compare
BC 12,10,equal // test CR2 for length comparison
BC 12,8,less // 1st len < 2nd len, result less
BR greater // 1st len > 2nd len must be greater
simple:
MOVBZ 0(R5), R9 // get byte from 1st operand
ADD $1,R5
MOVBZ 0(R6), R10 // get byte from 2nd operand
ADD $1,R6
CMPU R9, R10
BC 8,2,simple // bc ctr <> 0 && cr
BGT greater // 1st > 2nd
BLT less // 1st < 2nd
BC 12,10,equal // test CR2 for length comparison
BC 12,9,greater // 2nd len > 1st len
BR less // must be less
cmpne: // only here is not equal
MOVDBR (R5+R16),R8 // reload in reverse order
MOVDBR (R6+R16),R9
CMPU R8,R9 // compare correct endianness
BGT greater // here only if NE
less:
MOVD $-1,R3
MOVD R3,(R7) // return value if A < B
RET
equal:
MOVD $0,(R7) // return value if A == B
RET
greater:
MOVD $1,R3
MOVD R3,(R7) // return value if A > B
RET

// Do an efficient memcmp for ppc64 (BE)
// R3 = s1 len
// R4 = s2 len
// R5 = s1 addr
// R6 = s2 addr
// R7 = addr of return value
TEXT cmpbodyBE<>(SB),NOSPLIT|NOFRAME,$0-0
MOVD R3,R8 // set up length
CMP R3,R4,CR2 // unequal?
BC 12,8,setuplen // BLT CR2
MOVD R4,R8 // use R4 for comparison len
setuplen:
MOVD R8,CTR // set up loop counter
CMP R8,$8 // only optimize >=8
BLT simplecheck
DCBT (R5) // cache hint
DCBT (R6)
CMP R8,$32 // optimize >= 32
MOVD R8,R9
BLT setup8a // 8 byte moves only

setup32a:
SRADCC $5,R8,R9 // number of 32 byte chunks
MOVD R9,CTR
loop32a:
MOVD 0(R5),R9 // doublewords to compare
MOVD 0(R6),R10 // get 4 doublewords
MOVD 8(R5),R14
MOVD 8(R6),R15
CMPU R9,R10 // bytes equal?
BLT less // found to be less
BGT greater // found to be greater
MOVD 16(R5),R9 // get next pair of doublewords
MOVD 16(R6),R10
CMPU R14,R15 // bytes match?
BLT less // found less
BGT greater // found greater
MOVD 24(R5),R14 // get next pair of doublewords
MOVD 24(R6),R15
CMPU R9,R10 // bytes match?
BLT less // found to be less
BGT greater // found to be greater
ADD $32,R5 // bump up to next 32
ADD $32,R6
CMPU R14,R15 // bytes match?
BC 8,2,loop32a // br ctr and cr
BLT less // with BE, byte ordering is
BGT greater // good for compare
ANDCC $24,R8,R9 // Any 8 byte chunks?
BEQ leftover // and result is 0
setup8a:
SRADCC $3,R9,R9 // get the 8 byte count
BEQ leftover // shifted value is 0
MOVD R9,CTR // loop count for doublewords
loop8:
MOVD (R5),R9
MOVD (R6),R10
ADD $8,R5
ADD $8,R6
CMPU R9,R10 // match?
BC 8,2,loop8 // bt ctr <> 0 && cr
BGT greater
BLT less
leftover:
ANDCC $7,R8,R9 // check for leftover bytes
MOVD R9,CTR // save the ctr
BNE simple // leftover bytes
BC 12,10,equal // test CR2 for length comparison
BC 12,8,less
BR greater
simplecheck:
CMP R8,$0 // remaining compare length 0
BNE simple // do simple compare
BC 12,10,equal // test CR2 for length comparison
BC 12,8,less // 1st len < 2nd len, result less
BR greater // same len, must be equal
simple:
MOVBZ 0(R5),R9 // get byte from 1st operand
ADD $1,R5
MOVBZ 0(R6),R10 // get byte from 2nd operand
ADD $1,R6
CMPU R9,R10
BC 8,2,simple // bc ctr <> 0 && cr
BGT greater // 1st > 2nd
BLT less // 1st < 2nd
BC 12,10,equal // test CR2 for length comparison
BC 12,9,greater // 2nd len > 1st len
less:
MOVD $-1,R3
MOVD R3,(R7) // return value if A < B
RET
equal:
MOVD $0,(R7) // return value if A == B
RET
greater:
MOVD $1,R3
MOVD R3,(R7) // return value if A > B
RET

// Do an efficient memequal for ppc64
// R3 = s1
// R4 = s2
// R5 = len
// R9 = return value
// R6, R7 clobbered
TEXT runtime·memeqbody(SB),NOSPLIT|NOFRAME,$0-0
MOVD R5,CTR
CMP R5,$8 // only optimize >=8
Expand Down Expand Up @@ -983,58 +1190,23 @@ TEXT runtime·cmpstring(SB),NOSPLIT|NOFRAME,$0-40
MOVD s2_base+16(FP), R6
MOVD s2_len+24(FP), R4
MOVD $ret+32(FP), R7
BR runtime·cmpbody<>(SB)
#ifdef GOARCH_ppc64le
BR cmpbodyLE<>(SB)
#else
BR cmpbodyBE<>(SB)
#endif

TEXT bytes·Compare(SB),NOSPLIT|NOFRAME,$0-56
MOVD s1+0(FP), R5
MOVD s1+8(FP), R3
MOVD s2+24(FP), R6
MOVD s2+32(FP), R4
MOVD $ret+48(FP), R7
BR runtime·cmpbody<>(SB)

// On entry:
// R3 is the length of s1
// R4 is the length of s2
// R5 points to the start of s1
// R6 points to the start of s2
// R7 points to return value (-1/0/1 will be written here)
//
// On exit:
// R5, R6, R8, R9 and R10 are clobbered
TEXT runtime·cmpbody<>(SB),NOSPLIT|NOFRAME,$0-0
CMP R5, R6
BEQ samebytes // same starting pointers; compare lengths
SUB $1, R5
SUB $1, R6
MOVD R4, R8
CMP R3, R4
BGE 2(PC)
MOVD R3, R8 // R8 is min(R3, R4)
ADD R5, R8 // R5 is current byte in s1, R8 is last byte in s1 to compare
loop:
CMP R5, R8
BEQ samebytes // all compared bytes were the same; compare lengths
MOVBZU 1(R5), R9
MOVBZU 1(R6), R10
CMP R9, R10
BEQ loop
// bytes differed
MOVD $1, R4
BGT 2(PC)
NEG R4
MOVD R4, (R7)
RET
samebytes:
MOVD $1, R8
CMP R3, R4
BNE 3(PC)
MOVD R0, (R7)
RET
BGT 2(PC)
NEG R8
MOVD R8, (R7)
RET
#ifdef GOARCH_ppc64le
BR cmpbodyLE<>(SB)
#else
BR cmpbodyBE<>(SB)
#endif

TEXT runtime·fastrand(SB), NOSPLIT, $0-4
MOVD g_m(g), R4
Expand Down

0 comments on commit 1e28dce

Please sign in to comment.