forked from influxdata/influxdb
-
Notifications
You must be signed in to change notification settings - Fork 0
/
bloom.go
136 lines (116 loc) · 3.72 KB
/
bloom.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
package bloom
// NOTE:
// This package implements a limited bloom filter implementation based on
// Will Fitzgerald's bloom & bitset packages. It uses a zero-allocation xxhash
// implementation, rather than murmur3. It's implemented locally to support
// zero-copy memory-mapped slices.
//
// This also optimizes the filter by always using a bitset size with a power of 2.
import (
"fmt"
"math"
"github.com/cespare/xxhash"
)
// Filter represents a bloom filter.
type Filter struct {
k uint64
b []byte
mask uint64
}
// NewFilter returns a new instance of Filter using m bits and k hash functions.
// If m is not a power of two then it is rounded to the next highest power of 2.
func NewFilter(m uint64, k uint64) *Filter {
m = pow2(m)
return &Filter{k: k, b: make([]byte, m>>3), mask: m - 1}
}
// NewFilterBuffer returns a new instance of a filter using a backing buffer.
// The buffer length MUST be a power of 2.
func NewFilterBuffer(buf []byte, k uint64) (*Filter, error) {
m := pow2(uint64(len(buf)) * 8)
if m != uint64(len(buf))*8 {
return nil, fmt.Errorf("bloom.Filter: buffer bit count must a power of two: %d/%d", len(buf)*8, m)
}
return &Filter{k: k, b: buf, mask: m - 1}, nil
}
// Len returns the number of bits used in the filter.
func (f *Filter) Len() uint { return uint(len(f.b)) }
// K returns the number of hash functions used in the filter.
func (f *Filter) K() uint64 { return f.k }
// Bytes returns the underlying backing slice.
func (f *Filter) Bytes() []byte { return f.b }
// Clone returns a copy of f.
func (f *Filter) Clone() *Filter {
other := &Filter{k: f.k, b: make([]byte, len(f.b)), mask: f.mask}
copy(other.b, f.b)
return other
}
// Insert inserts data to the filter.
func (f *Filter) Insert(v []byte) {
h := f.hash(v)
for i := uint64(0); i < f.k; i++ {
loc := f.location(h, i)
f.b[loc>>3] |= 1 << (loc & 7)
}
}
// Contains returns true if the filter possibly contains v.
// Returns false if the filter definitely does not contain v.
func (f *Filter) Contains(v []byte) bool {
h := f.hash(v)
for i := uint64(0); i < f.k; i++ {
loc := f.location(h, i)
if f.b[loc>>3]&(1<<(loc&7)) == 0 {
return false
}
}
return true
}
// Merge performs an in-place union of other into f.
// Returns an error if m or k of the filters differs.
func (f *Filter) Merge(other *Filter) error {
if other == nil {
return nil
}
// Ensure m & k fields match.
if len(f.b) != len(other.b) {
return fmt.Errorf("bloom.Filter.Merge(): m mismatch: %d <> %d", len(f.b), len(other.b))
} else if f.k != other.k {
return fmt.Errorf("bloom.Filter.Merge(): k mismatch: %d <> %d", f.b, other.b)
}
// Perform union of each byte.
for i := range f.b {
f.b[i] |= other.b[i]
}
return nil
}
// location returns the ith hashed location using two hash values.
func (f *Filter) location(h [2]uint64, i uint64) uint {
return uint((h[0] + h[1]*i) & f.mask)
}
// hash returns two 64-bit hashes based on the output of xxhash.
func (f *Filter) hash(data []byte) [2]uint64 {
v1 := xxhash.Sum64(data)
var v2 uint64
if len(data) > 0 {
b := data[len(data)-1] // We'll put the original byte back.
data[len(data)-1] = byte(0)
v2 = xxhash.Sum64(data)
data[len(data)-1] = b
}
return [2]uint64{v1, v2}
}
// Estimate returns an estimated bit count and hash count given the element count and false positive rate.
func Estimate(n uint64, p float64) (m uint64, k uint64) {
m = uint64(math.Ceil(-1 * float64(n) * math.Log(p) / math.Pow(math.Log(2), 2)))
k = uint64(math.Ceil(math.Log(2) * float64(m) / float64(n)))
return m, k
}
// pow2 returns the number that is the next highest power of 2.
// Returns v if it is a power of 2.
func pow2(v uint64) uint64 {
for i := uint64(8); i < 1<<62; i *= 2 {
if i >= v {
return i
}
}
panic("unreachable")
}