Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Browse files

moved bloom filter into stasis

git-svn-id: svn+ssh://svn.corp.yahoo.com/yahoo/yrl/labs/pnuts/code/logstore@3896 8dad8b1f-cf64-0410-95b6-bcf113ffbcfe
  • Loading branch information...
commit 0652b0bf848001677c9d2fb94969e66cadd78f36 1 parent 94c427e
sears authored
View
2  CMakeLists.txt
@@ -96,5 +96,5 @@ ENDIF ( "${CMAKE_C_COMPILER_ID}" STREQUAL "GNU" )
#CONFIGURE_FILE(${CMAKE_CURRENT_SOURCE_DIR}/config.h.cmake ${CMAKE_CURRENT_BINARY_DIR}/config.h)
IF ( HAVE_STASIS )
- ADD_LIBRARY(blsm bLSM.cpp diskTreeComponent.cpp memTreeComponent.cpp dataPage.cpp mergeScheduler.cpp tupleMerger.cpp mergeStats.cpp mergeManager.cpp bloomFilter.c)
+ ADD_LIBRARY(blsm bLSM.cpp diskTreeComponent.cpp memTreeComponent.cpp dataPage.cpp mergeScheduler.cpp tupleMerger.cpp mergeStats.cpp mergeManager.cpp)
ENDIF ( HAVE_STASIS )
View
8 bLSM.h
@@ -189,11 +189,11 @@ class bLSM {
bool mightBeOnDisk(dataTuple * t) {
if(tree_c1) {
if(!tree_c1->bloom_filter) { DEBUG("no c1 bloom filter\n"); return true; }
- if(bloom_filter_lookup(tree_c1->bloom_filter, (const char*)t->strippedkey(), t->strippedkeylen())) { DEBUG("in c1\n"); return true; }
+ if(stasis_bloom_filter_lookup(tree_c1->bloom_filter, (const char*)t->strippedkey(), t->strippedkeylen())) { DEBUG("in c1\n"); return true; }
}
if(tree_c1_prime) {
if(!tree_c1_prime->bloom_filter) { DEBUG("no c1' bloom filter\n"); return true; }
- if(bloom_filter_lookup(tree_c1_prime->bloom_filter, (const char*)t->strippedkey(), t->strippedkeylen())) { DEBUG("in c1'\n"); return true; }
+ if(stasis_bloom_filter_lookup(tree_c1_prime->bloom_filter, (const char*)t->strippedkey(), t->strippedkeylen())) { DEBUG("in c1'\n"); return true; }
}
return mightBeAfterMemMerge(t);
}
@@ -202,13 +202,13 @@ class bLSM {
if(tree_c1_mergeable) {
if(!tree_c1_mergeable->bloom_filter) { DEBUG("no c1m bloom filter\n"); return true; }
- if(bloom_filter_lookup(tree_c1_mergeable->bloom_filter, (const char*)t->strippedkey(), t->strippedkeylen())) { DEBUG("in c1m'\n");return true; }
+ if(stasis_bloom_filter_lookup(tree_c1_mergeable->bloom_filter, (const char*)t->strippedkey(), t->strippedkeylen())) { DEBUG("in c1m'\n");return true; }
}
if(tree_c2) {
if(!tree_c2->bloom_filter) { DEBUG("no c2 bloom filter\n"); return true; }
- if(bloom_filter_lookup(tree_c2->bloom_filter, (const char*)t->strippedkey(), t->strippedkeylen())) { DEBUG("in c2\n");return true; }
+ if(stasis_bloom_filter_lookup(tree_c2->bloom_filter, (const char*)t->strippedkey(), t->strippedkeylen())) { DEBUG("in c2\n");return true; }
}
return false;
}
View
159 bloomFilter.c
@@ -1,159 +0,0 @@
-/*
- * bloomFilter.c
- *
- * Copyright 2010-2012 Yahoo! Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- *
- * Author: sears
- */
-#include <assert.h>
-#include <math.h>
-#include <stdio.h>
-#include "bloomFilter.h"
-/**
- Variable names:
- m: number of bloom filter bits
- n: number of bloom filter entries
- k: number of hash functions = ln(2) * (m/n)
- c: m/n
- f: false positive rate = (1/2)^k ~= (0.6185)^(m/n) ;
- taking log_0.6185 of both sides: k log_0.6185(1/2) = m/n ;
- applying change of base: k log(1/2) / log(0.6185) = m / n
- (but that's not useful; this is:)
-
- f ~= 0.6185 ^ (m/n)
- log_0.6185(f) = m/n
- log(f) / log(0.6185) = m / n
- m = n log f / log 0.6185
- p: probability a given bit is still 0 ~= e^(-kn/m) (so it is 1 with probability ~= 1 - e^(-kn/m))
- */
-
-static uint64_t bloom_filter_calc_num_buckets(uint64_t num_expected_items,
- double false_positive_rate) {
- // m = n log f / log 0.6185
- return ((uint64_t) ceil(((double)num_expected_items) *
- log(false_positive_rate) / log(0.6185)));
- // m = - n ln f / ln 2 ^ 2 = - n ln f / 0.4804 = - n log f / (0.4343 * 0.4804) = -n log f / 0.2086
-}
-static int bloom_filter_calc_num_functions(uint64_t num_expected_items,
- uint64_t num_buckets) {
- // k = ln(2) * (m/n)
- int ret = floor((log(2) / log(exp(1.0)))
- * ((double) num_buckets) / (double) num_expected_items);
- if(ret == 0) {
- return 1;
- } else {
- return ret;
- }
-}
-static double bloom_filter_current_false_positive_rate(uint64_t actual_number_of_items,
- uint64_t num_buckets) {
- // 0.6185^(m/n)
- return pow(0.6185, ((double)num_buckets)/(double)actual_number_of_items);
-}
-
-struct bloom_filter_t {
- uint64_t (*func_a)(const char *, int);
- uint64_t (*func_b)(const char *, int);
- uint64_t num_expected_items;
- double desired_false_positive_rate;
- uint64_t num_buckets;
- uint8_t * buckets;
- uint64_t num_functions;
- uint64_t*result_scratch_space;
- uint64_t actual_number_of_items;
-};
-bloom_filter_t * bloom_filter_create(uint64_t(*func_a)(const char*,int),
- uint64_t(*func_b)(const char*,int),
- uint64_t num_expected_items,
- double false_positive_rate) {
- bloom_filter_t * ret = malloc(sizeof(*ret));
- ret->func_a = func_a;
- ret->func_b = func_b;
- ret->num_expected_items = num_expected_items;
- ret->desired_false_positive_rate = false_positive_rate;
- ret->num_buckets = bloom_filter_calc_num_buckets(ret->num_expected_items, ret->desired_false_positive_rate);
- ret->buckets = calloc((ret->num_buckets / 8) + ((ret->num_buckets % 8 == 0) ? 0 : 1), 1);
- ret->num_functions = bloom_filter_calc_num_functions(ret->num_expected_items, ret->num_buckets);
- ret->result_scratch_space = malloc(sizeof(*ret->result_scratch_space) * ret->num_functions);
- ret->actual_number_of_items = 0;
- return ret;
-}
-void bloom_filter_destroy(bloom_filter_t* bf) {
- free(bf->buckets);
- free(bf->result_scratch_space);
- free(bf);
-}
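-// TODO: bucket indices are reduced with %. Masking with & would be cheaper, but it needs num_buckets rounded up to a power of two, which could nearly double the memory we use. Make the choice selectable with a #define flag.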
-static void bloom_filter_calc_functions(bloom_filter_t * bf, uint64_t* results, const char * key, int keylen) {
- uint64_t fa = bf->func_a(key, keylen);
- uint64_t fb = bf->func_b(key, keylen);
-
- results[0] = (fa + fb) % bf->num_buckets;
- for(int i = 1; i < bf->num_functions; i++) {
- results[i] = (results[i-1] + fb ) % bf->num_buckets;
- }
-}
-
-static const uint8_t bloom_filter_bit_masks[] = { 1, 2, 4, 8, 16, 32, 64, 128 };
-static void bloom_filter_set_bit(bloom_filter_t *bf, uint64_t bit) {
- uint64_t array_offset = bit >> 3;
- uint8_t bit_number = bit & 7;
-
- assert(bit < bf->num_buckets);
-
- bf->buckets[array_offset] |= bloom_filter_bit_masks[bit_number];
-
-}
-/**
- @return 0 if the bit is not set, a nonzero value otherwise.
- */
-static uint8_t bloom_filter_get_bit(bloom_filter_t *bf, uint64_t bit) {
- uint64_t array_offset = bit >> 3;
- uint8_t bit_number = bit & 7;
-
- assert(bit < bf->num_buckets);
-
- return bf->buckets[array_offset] & bloom_filter_bit_masks[bit_number];
-}
-void bloom_filter_insert(bloom_filter_t * bf, const char *key, int len) {
- bloom_filter_calc_functions(bf, bf->result_scratch_space, key, len);
- for(int i = 0; i < bf->num_functions; i++) {
- bloom_filter_set_bit(bf, bf->result_scratch_space[i]);
- }
- bf->actual_number_of_items++;
-}
-int bloom_filter_lookup(bloom_filter_t * bf, const char * key, int len) {
- int ret = 1;
- uint64_t * scratch = malloc(sizeof(*scratch) * bf->num_functions);
- bloom_filter_calc_functions(bf, scratch, key, len);
- for(int i = 0; i < bf->num_functions; i++) {
- ret = ret && bloom_filter_get_bit(bf, scratch[i]);
- }
- free(scratch);
- return ret;
-}
-
-void bloom_filter_print_stats(bloom_filter_t * bf) {
- printf("Design capacity %lld design false positive %f\n"
- "Current item count %lld current false positive %f\n"
- "Number of buckets %lld (%f MB), number of hash functions %lld\n",
- (long long)bf->num_expected_items, bf->desired_false_positive_rate,
- (long long)bf->actual_number_of_items,
- bloom_filter_current_false_positive_rate(bf->actual_number_of_items,
- bf->num_buckets),
- (long long)bf->num_buckets,
- ((double)bf->num_buckets) / (8.0 * 1024.0 * 1024.0),
- (long long)bf->num_functions);
-}
View
50 bloomFilter.h
@@ -1,50 +0,0 @@
-/*
- * bloomFilter.h
- *
- * Copyright 2010-2012 Yahoo! Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- *
- * Author: sears
- */
-#ifndef BLOOM_FILTER_H
-#define BLOOM_FILTER_H
-
-#include <stasis/common.h>
-
-BEGIN_C_DECLS
-
-typedef struct bloom_filter_t bloom_filter_t;
-
-/**
- @return 0 if there is not enough memory, or some other error occurred; a
- pointer to the new bloom filter otherwise.
- */
-bloom_filter_t * bloom_filter_create(uint64_t(*hash_func_a)(const char*,int),
- uint64_t(*hash_func_b)(const char*,int),
- uint64_t num_expected_items,
- double false_positive_rate);
-
-void bloom_filter_destroy(bloom_filter_t*);
-
-void bloom_filter_insert(bloom_filter_t * bf, const char* key, int len);
-/**
- @return 1 if the value might be in the bloom filter, 0 otherwise
- */
-int bloom_filter_lookup(bloom_filter_t * bf, const char* key, int len);
-
-void bloom_filter_print_stats(bloom_filter_t * bf);
-
-END_C_DECLS
-
-#endif
View
4 diskTreeComponent.cpp
@@ -79,7 +79,7 @@ void diskTreeComponent::writes_done() {
int diskTreeComponent::insertTuple(int xid, dataTuple *t)
{
if(bloom_filter) {
- bloom_filter_insert(bloom_filter, (const char*)t->strippedkey(), t->strippedkeylen());
+ stasis_bloom_filter_insert(bloom_filter, (const char*)t->strippedkey(), t->strippedkeylen());
}
int ret = 0; // no error.
if(dp==0) {
@@ -135,7 +135,7 @@ dataTuple * diskTreeComponent::findTuple(int xid, dataTuple::key_t key, size_t k
dataTuple * tup=0;
if(bloom_filter) {
- if(!bloom_filter_lookup(bloom_filter, (const char*)key, keySize)) {
+ if(!stasis_bloom_filter_lookup(bloom_filter, (const char*)key, keySize)) {
return NULL;
}
}
View
10 diskTreeComponent.h
@@ -25,7 +25,7 @@
#include "dataPage.h"
#include "dataTuple.h"
#include "mergeStats.h"
-#include "bloomFilter.h"
+#include <stasis/util/bloomFilter.h>
#include <stasis/util/crc32.h>
extern "C" {
@@ -50,10 +50,10 @@ class diskTreeComponent {
stats(stats),
bloom_filter(bloom_filter_size == 0
? 0
- : bloom_filter_create(diskTreeComponent_hash_func_a,
+ : stasis_bloom_filter_create(diskTreeComponent_hash_func_a,
diskTreeComponent_hash_func_b,
bloom_filter_size, 0.01)) {
- if(bloom_filter) bloom_filter_print_stats(bloom_filter);
+ if(bloom_filter) stasis_bloom_filter_print_stats(bloom_filter);
}
diskTreeComponent(int xid, recordid root, recordid internal_node_state,
@@ -65,7 +65,7 @@ class diskTreeComponent {
bloom_filter(0) {}
~diskTreeComponent() {
- if(bloom_filter) bloom_filter_destroy(bloom_filter);
+ if(bloom_filter) stasis_bloom_filter_destroy(bloom_filter);
delete dp;
delete ltree;
}
@@ -208,7 +208,7 @@ class diskTreeComponent {
};
};
- bloom_filter_t * bloom_filter;
+ stasis_bloom_filter_t * bloom_filter;
class iterator
{
View
2  test/CMakeLists.txt
@@ -15,8 +15,6 @@
# limitations under the License.
IF( HAVE_STASIS )
CREATE_CHECK(check_gen)
- CREATE_CHECK(check_bloomFilter)
- CREATE_CHECK(check_testAndSet)
CREATE_CHECK(check_logtree)
CREATE_CHECK(check_datapage)
CREATE_CHECK(check_logtable)
View
116 test/check_bloomFilter.cpp
@@ -1,116 +0,0 @@
-/*
- * check_bloomFilter.cpp
- *
- * Copyright 2010-2012 Yahoo! Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- *
- * Created on: Oct 2, 2010
- * Author: sears
- */
-#include <stasis/util/hashFunctions.h>
-#include <bloomFilter.h>
-#include <assert.h>
-#include <stdio.h>
-#include <sys/time.h>
-#include <stasis/util/crc32.h>
-
-/*
- * This file can test CRC and FNV-1 based hash functions. Based on early experiments:
- *
- * CRC32 insert/lookup: 11/13 seconds, 1.1% false positive
- * FNV-1 insert/lookup: 8/9 seconds, 2.8% false positive
- *
- * Expected false positive rate is 1%.
- */
-
-static uint64_t hash_a(const char* a, int len) {
- return stasis_crc32(a,len,0xcafebabe);
-}
-
-static uint64_t hash_b(const char* a, int len) {
- return stasis_crc32(a,len,0xdeadbeef);
-}
-static uint64_t hash_a_fnv(const char* a, int len) {
- return stasis_util_hash_fnv_1_uint32_t((const byte*)a, len);
-}
-static uint64_t hash_b_fnv(const char* a, int len) {
- return stasis_util_hash_fnv_1_uint64_t((const byte*)a, len);
-}
-
-static char * malloc_random_string(int group) {
- char * str = 0;
- int strlen = 0;
- while(!strlen) strlen = 128 + (rand() & 127);
- str = (char*)malloc(strlen + 1);
- str[0] = group;
-
- for(int i = 1; i < strlen; i++) {
- str[i] = (rand() & 128) + 1;
- }
- str[strlen] = 0;
- return str;
-}
-
-int main(int argc, char * argv[]) {
- (void)hash_a; (void)hash_b;
- (void)hash_a_fnv; (void)hash_b_fnv;
-
- const int num_inserts = 1000000;
- char ** strings = (char**)malloc(num_inserts * sizeof(char*));
- uint64_t sum_strlen = 0;
- struct timeval start, stop;
- gettimeofday(&start, 0);
- printf("seed: %lld\n", (long long)start.tv_sec);
- srand(start.tv_sec);
- for(int i = 0; i < num_inserts; i++) {
- strings[i] = malloc_random_string(1);
- sum_strlen += strlen(strings[i]);
- }
- gettimeofday(&stop,0);
- printf("Generated strings in %d seconds. Mean string length: %f\n", (int)(stop.tv_sec - start.tv_sec), (double)(sum_strlen)/(double)num_inserts);
-
- bloom_filter_t * bf = bloom_filter_create(hash_a, hash_b, num_inserts, 0.01);
- bloom_filter_print_stats(bf);
- gettimeofday(&start, 0);
- for(int i = 0; i < num_inserts; i++) {
- bloom_filter_insert(bf,strings[i], strlen(strings[i]));
- }
- gettimeofday(&stop, 0);
- printf("Inserted strings in %d seconds.\n", (int)(stop.tv_sec - start.tv_sec));
-
- gettimeofday(&start, 0);
- for(int i = 0; i < num_inserts; i++) {
- assert(bloom_filter_lookup(bf, strings[i], strlen(strings[i])));
- }
- gettimeofday(&stop, 0);
- printf("Looked up strings in %d seconds.\n", (int)(stop.tv_sec - start.tv_sec));
- bloom_filter_print_stats(bf);
-
- uint64_t false_positives = 0;
- gettimeofday(&start, 0);
- for(int i = 0; i < num_inserts; i++) {
- char * str = malloc_random_string(2);
- if(bloom_filter_lookup(bf, str, strlen(str))) {
- false_positives ++;
- }
- assert(bloom_filter_lookup(bf, strings[i], strlen(strings[i])));
- free(str);
- }
- gettimeofday(&stop, 0);
- printf("Generated and looked up non-existant strings in %d seconds\n"
- "false positive rate was %lf\n", (int)(stop.tv_sec - start.tv_sec),
- ((double)false_positives)/(double)num_inserts);
-
- return 0;
-}
Please sign in to comment.
Something went wrong with that request. Please try again.