-
Notifications
You must be signed in to change notification settings - Fork 5
/
bloomfilter.rs
141 lines (119 loc) · 3.25 KB
/
bloomfilter.rs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
use std::{
collections::hash_map::DefaultHasher,
hash::{Hash, Hasher},
num::Wrapping,
};
use bitvec::{bitvec, prelude::Msb0, vec::BitVec};
use fxhash::hash64;
use uuid::Uuid;
/// This is the custom bloomfilter implementation.
/// This is used to figure out if a key exists in the database before ever searching the SSTables.
/// The bloomfilter guarantees that failed membership test means the key does not exist.
pub(crate) struct BloomFilter {
arr: BFInner,
}
impl BloomFilter {
pub(crate) fn new() -> BloomFilter {
Self {
arr: BFInner::new(),
}
}
pub(crate) fn find(&self, id: Uuid) -> bool {
self.arr.find(id)
}
pub(crate) fn add(&mut self, id: Uuid) {
self.arr.add(id);
}
pub(crate) fn delete(&mut self, id: Uuid) {
self.arr.delete(id);
}
}
// Accomodate 10 million entries with a false positive probability of 1e-7.
// But since we only want to check if something does not exist, there is some room for error?
// check here for the values (for 10M IDs) - https://hur.st/bloomfilter/?n=10000000&p=1.0E-7&m=&k=10
const BF_SIZE: usize = 56_166_771;
const NUM_HASHES: usize = 10;
struct BFInner {
inner: BitVec<u8, Msb0>,
size: usize,
}
impl BFInner {
fn new() -> Self {
Self {
inner: bitvec!(u8, Msb0; 0; BF_SIZE),
size: BF_SIZE,
}
}
fn find(&self, id: Uuid) -> bool {
for index in self.hash(id) {
match self.inner.get(index) {
Some(val) => {
if !*val {
return false;
}
}
None => return false,
}
}
true
}
fn calculate_hash(id: Uuid) -> u64 {
let mut s = DefaultHasher::new();
id.hash(&mut s);
s.finish()
}
fn hash_i(&self, id: Uuid, i: usize) -> usize {
let hash1 = Wrapping(hash64(&id));
let hash2 = Wrapping(Self::calculate_hash(id));
// let hash1 = Wrapping(xx::hash64(id));
// let hash2 = Wrapping(sea::hash64(id.as_bytes()));
let hash_mod = Wrapping((i * i) as u64) * hash2;
(hash1 + hash_mod).0 as usize % self.size
}
fn hash(&self, id: Uuid) -> Vec<usize> {
let mut hashes = Vec::new();
for i in 0..NUM_HASHES {
hashes.push(self.hash_i(id, i));
}
hashes
}
fn add(&mut self, id: Uuid) {
for index in self.hash(id) {
self.inner.set(index, true);
}
}
fn delete(&mut self, id: Uuid) {
for index in self.hash(id) {
self.inner.set(index, false);
}
}
}
#[cfg(test)]
mod test {
use uuid::Uuid;
use super::BloomFilter;
#[test]
fn bf_create_test() {
let bf = BloomFilter::new();
}
#[test]
fn bf_add_test() {
let mut bf = BloomFilter::new();
let id = Uuid::new_v4();
bf.add(id);
}
#[test]
fn bf_find_test() {
let mut bf = BloomFilter::new();
let id = Uuid::new_v4();
bf.add(id);
assert!(bf.find(id));
}
#[test]
fn bf_delete_test() {
let mut bf = BloomFilter::new();
let id = Uuid::new_v4();
bf.add(id);
bf.delete(id);
}
}