From f12759453889e8238576d94957fb7757f9945fa1 Mon Sep 17 00:00:00 2001 From: Daniel Lemire Date: Fri, 16 Apr 2021 15:24:29 -0400 Subject: [PATCH] Use branchless code in generic union2by2 loop Co-authored-by: Daniel Lemire --- benchmark_test.go | 3 +- real_data_benchmark_test.go | 10 +++++ setutil.go | 16 +++++++ setutil_generic.go | 75 +++++++++++++------------------- setutil_test.go | 85 ++++++++++++++++++++++++++++++++++++- 5 files changed, 141 insertions(+), 48 deletions(-) diff --git a/benchmark_test.go b/benchmark_test.go index 01d694a6..51f493f2 100644 --- a/benchmark_test.go +++ b/benchmark_test.go @@ -3,11 +3,12 @@ package roaring import ( "bytes" "fmt" - "github.com/stretchr/testify/require" "math/rand" "runtime" "testing" + "github.com/stretchr/testify/require" + "github.com/willf/bitset" ) diff --git a/real_data_benchmark_test.go b/real_data_benchmark_test.go index 7e47a323..298487af 100644 --- a/real_data_benchmark_test.go +++ b/real_data_benchmark_test.go @@ -175,6 +175,16 @@ func BenchmarkRealDataParOr(b *testing.B) { }) } +func BenchmarkRealDataOr(b *testing.B) { + benchmarkRealDataAggregate(b, func(bitmaps []*Bitmap) uint64 { + t := uint64(0) + for i := 1; i < len(bitmaps); i++ { + t += Or(bitmaps[i-1], bitmaps[i]).GetCardinality() + } + return t + }) +} + func BenchmarkRealDataParHeapOr(b *testing.B) { benchmarkRealDataAggregate(b, func(bitmaps []*Bitmap) uint64 { return ParHeapOr(0, bitmaps...).GetCardinality() diff --git a/setutil.go b/setutil.go index 663c4fa3..184f5255 100644 --- a/setutil.go +++ b/setutil.go @@ -1,5 +1,7 @@ package roaring +import "unsafe" + func equal(a, b []uint16) bool { if len(a) != len(b) { return false @@ -548,3 +550,17 @@ func binarySearch(array []uint16, ikey uint16) int { } return -(low + 1) } + +// compareuint16 compares two numbers in a branchless manner. +// Returns -1 if x < y, zero otherwise. 
+func compareuint16(x, y uint16) int { + return (int(x) - int(y)) >> 63 +} + +// uint16SlicePtr returns a pointer to the element at the given slice +// index without bounds-checking idx. Use cautiously. +func uint16SlicePtr(slice []uint16, idx uint) *uint16 { + p := unsafe.Pointer(&slice[0]) + indexp := (unsafe.Pointer)(uintptr(p) + 2*uintptr(idx)) + return (*uint16)(indexp) +} diff --git a/setutil_generic.go b/setutil_generic.go index 9edcc902..48cdbc78 100644 --- a/setutil_generic.go +++ b/setutil_generic.go @@ -3,9 +3,6 @@ package roaring func union2by2(set1 []uint16, set2 []uint16, buffer []uint16) int { - pos := 0 - k1 := 0 - k2 := 0 if 0 == len(set2) { buffer = buffer[:len(set1)] copy(buffer, set1[:]) @@ -16,48 +13,34 @@ func union2by2(set1 []uint16, set2 []uint16, buffer []uint16) int { copy(buffer, set2[:]) return len(set2) } - s1 := set1[k1] - s2 := set2[k2] - buffer = buffer[:cap(buffer)] - for { - if s1 < s2 { - buffer[pos] = s1 - pos++ - k1++ - if k1 >= len(set1) { - copy(buffer[pos:], set2[k2:]) - pos += len(set2) - k2 - break - } - s1 = set1[k1] - } else if s1 == s2 { - buffer[pos] = s1 - pos++ - k1++ - k2++ - if k1 >= len(set1) { - copy(buffer[pos:], set2[k2:]) - pos += len(set2) - k2 - break - } - if k2 >= len(set2) { - copy(buffer[pos:], set1[k1:]) - pos += len(set1) - k1 - break - } - s1 = set1[k1] - s2 = set2[k2] - } else { // if (set1[k1]>set2[k2]) - buffer[pos] = s2 - pos++ - k2++ - if k2 >= len(set2) { - copy(buffer[pos:], set1[k1:]) - pos += len(set1) - k1 - break - } - s2 = set2[k2] - } + var s1, s2 uint16 + pos := uint(0) + k1 := uint(0) + k2 := uint(0) + len1 := uint(len(set1)) + len2 := uint(len(set2)) + buffer = buffer[:len1+len2] + for k1 < len1 && k2 < len2 { + s1 = *uint16SlicePtr(set1, k1) + s2 = *uint16SlicePtr(set2, k2) + + sflag := compareuint16(s1, s2) // -1 if s1 < s2, zero otherwise + lflag := compareuint16(s2, s1) // -1 if s2 < s1, zero otherwise + *uint16SlicePtr(buffer, pos) = uint16(-sflag)*s1 + uint16(1+sflag)*s2 + + pos++ + k1 += uint(1 + 
lflag) + k2 += uint(1 + sflag) + } + if k1 >= len1 { + copy(buffer[pos:], set2[k2:]) + pos += len2 - k2 + return int(pos) + } + if k2 >= len2 { + copy(buffer[pos:], set1[k1:]) + pos += len1 - k1 + return int(pos) } - return pos + return int(pos) } diff --git a/setutil_test.go b/setutil_test.go index de5fee77..e559aefd 100644 --- a/setutil_test.go +++ b/setutil_test.go @@ -3,8 +3,11 @@ package roaring // to run just these tests: go test -run TestSetUtil* import ( - "github.com/stretchr/testify/assert" + "math/rand" + "sort" "testing" + + "github.com/stretchr/testify/assert" ) func TestSetUtilDifference(t *testing.T) { @@ -41,6 +44,19 @@ func TestSetUtilDifference(t *testing.T) { assert.Equal(t, expectedresult, result) } +func TestCompareuint16(t *testing.T) { + assert.Equal(t, 0, compareuint16(42, 42)) + assert.Equal(t, 0, compareuint16(42, 1)) + assert.Equal(t, -1, compareuint16(1, 42)) +} + +func TestUint16SlicePtr(t *testing.T) { + slice := []uint16{42, 41, 1, 2, 3} + for i := range slice { + assert.Equal(t, slice[i], *uint16SlicePtr(slice, uint(i))) + } +} + func TestSetUtilUnion(t *testing.T) { data1 := []uint16{0, 1, 2, 3, 4, 9} data2 := []uint16{2, 3, 4, 5, 8, 9, 11} @@ -136,3 +152,70 @@ func TestSetUtilBinarySearch(t *testing.T) { } } } + +// go test -bench BenchmarkUnion2by2 -run - +func BenchmarkUnion2by2(b *testing.B) { + r := rand.New(rand.NewSource(123456)) + + // this is important: we pre-generate a large number of randomized + // sorted arrays in order to disable the effects of branch prediction, + // making benchmarks against non-branchless implementations + // more realistic. 
+ + sarrsnum := 1024 + sz1 := 1024 + sarrs := make([][]uint16, sarrsnum) + for i := 0; i < sarrsnum; i++ { + sarrs[i] = make([]uint16, sz1) + for j := 0; j < sz1; j++ { + sarrs[i][j] = uint16(r.Intn(MaxUint16)) + } + sort.Sort(uint16Slice(sarrs[i])) + } + + sz2 := 1024 + s2 := make([]uint16, sz2) + + sz3 := 1024 + s3 := make([]uint16, sz3) + + sz4 := 1024 + s4 := make([]uint16, sz4) + + // We are going to populate our arrays with random data. + // Importantly, we need to sort. There might be a few + // duplicates, by random chance, but it should not affect + // results too much. + + for i := 0; i < sz2; i++ { + s2[i] = uint16(r.Intn(MaxUint16)) + } + sort.Sort(uint16Slice(s2)) + + for i := 0; i < sz3; i++ { + s3[i] = uint16(r.Intn(MaxUint16)) + } + sort.Sort(uint16Slice(s3)) + + for i := 0; i < sz4; i++ { + s4[i] = uint16(r.Intn(MaxUint16)) + } + sort.Sort(uint16Slice(s4)) + + buf := make([]uint16, sz1+sz2+sz3+sz4) + + b.Run("union2by2", func(b *testing.B) { + b.ResetTimer() + for i := 0; i < b.N; i++ { + for i := 0; i < sarrsnum; i++ { + union2by2(sarrs[i], s2, buf) + union2by2(sarrs[i], s3, buf) + union2by2(sarrs[i], s4, buf) + } + } + }) + + // the old, non-branchless implementation for performance + // comparison can be found here: + // https://github.com/RoaringBitmap/roaring/blob/ff33c3b226c3ac033bf1a0b0f3ed647fc9cd2efa/setutil_generic.go +}