Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Browse files

Use MurmurHash 3 for String hashing

This gives us a fast hashing algorithm with good distribution.
It also allows for using a seed value to randomize the hash
output so it mitigates hash collision DoS attacks.
  • Loading branch information...
commit 1d69526c484cc9435a7198e41b8995db6c3acf1a 1 parent 4aa59c4
@dbussink dbussink authored
View
2  vm/builtin/bignum.cpp
@@ -1276,7 +1276,7 @@ namespace rubinius {
that unused memory. This might only be a problem if calculations
are leaving cruft in those unused bits. However, since Bignums
are immutable, this shouldn't happen to us. */
- return String::hash_str((unsigned char *)a->dp, a->used * sizeof(mp_digit));
+ return String::hash_str(state, (unsigned char *)a->dp, a->used * sizeof(mp_digit));
}
size_t Bignum::managed_memory_size(STATE) {
View
4 vm/builtin/object.cpp
@@ -344,7 +344,7 @@ namespace rubinius {
hashval Object::hash(STATE) {
if(!reference_p()) {
-#ifdef _LP64
+#ifdef IS_X8664
uintptr_t key = reinterpret_cast<uintptr_t>(this);
key = (~key) + (key << 21); // key = (key << 21) - key - 1;
key = key ^ (key >> 24);
@@ -372,7 +372,7 @@ namespace rubinius {
} else if(Bignum* bignum = try_as<Bignum>(this)) {
return bignum->hash_bignum(state);
} else if(Float* flt = try_as<Float>(this)) {
- return String::hash_str((unsigned char *)(&(flt->val)), sizeof(double));
+ return String::hash_str(state, (unsigned char *)(&(flt->val)), sizeof(double));
} else {
return id(state)->to_native();
}
View
18 vm/builtin/randomizer.cpp
@@ -88,7 +88,7 @@ namespace rubinius {
Randomizer* r = state->new_object<Randomizer>(G(randomizer));
r->lock_ = RBX_SPINLOCK_INIT;
r->rng_data(state, ByteArray::create(state, sizeof(struct random_state)));
-
+ random_init_single(r->rng_state(), 5489UL);
return r;
}
@@ -99,8 +99,6 @@ namespace rubinius {
randomizer->klass(state, cls);
}
- random_init_single(randomizer->rng_state(), 5489UL);
-
return randomizer;
}
@@ -194,5 +192,19 @@ namespace rubinius {
Float* Randomizer::rand_float(STATE) {
return Float::create(state, rb_genrand_real());
}
+
+ /*
+ * Return a random 32-bit value without depending
+ * on anything externally so it can be used
+ * also outside the context of the VM.
+ *
+ * Takes no STATE and touches no Randomizer instance: a local
+ * random_state is initialized from a 4-word seed buffer and a
+ * single value is drawn from it.
+ * NOTE(review): random_seed() is defined elsewhere; presumably it
+ * overwrites the zeroed buffer with fresh seed material -- confirm.
+ */
+ uint32_t Randomizer::random_uint32() {
+ uint32_t seed[] = { 0, 0, 0, 0 };
+ random_seed(seed, 4);
+ struct random_state rng; // throwaway generator state, local to this call
+ random_init_array(&rng, seed, 4);
+ return random_gen_uint32(&rng);
+ }
+
}
View
2  vm/builtin/randomizer.hpp
@@ -52,6 +52,8 @@ namespace rubinius {
// Rubinius.primitive :randomizer_rand_float
Float* rand_float(STATE);
+ static uint32_t random_uint32();
+
class Info : public TypeInfo {
public:
BASIC_TYPEINFO(TypeInfo)
View
49 vm/builtin/string.cpp
@@ -15,6 +15,8 @@
#include "builtin/symbol.hpp"
#include "builtin/tuple.hpp"
+#include "util/murmur_hash3.hpp"
+
#include "configuration.hpp"
#include "vm.hpp"
#include "object_utils.hpp"
@@ -666,52 +668,21 @@ namespace rubinius {
unsigned char* bp = (unsigned char*)(byte_address());
- hashval h = hash_str(bp, byte_size());
+ hashval h = hash_str(state, bp, byte_size());
hash_value(state, Fixnum::from(h));
return h;
}
- // see http://isthe.com/chongo/tech/comp/fnv/#FNV-param
-#ifdef _LP64
- const static unsigned long FNVOffsetBasis = 14695981039346656037UL;
- const static unsigned long FNVHashPrime = 1099511628211UL;
+ hashval String::hash_str(const unsigned char *bp, unsigned int sz, uint32_t seed) { // Hash the sz bytes at bp with MurmurHash3 keyed by seed; the result is masked with FIXNUM_MAX.
+#ifdef IS_X8664
+ hashval hv[2]; // MurmurHash3_x64_128 writes two 64-bit words here. NOTE(review): assumes hashval is 64 bits wide in this branch -- confirm.
+ MurmurHash3_x64_128(bp, sz, seed, hv); // NOTE(review): sz is unsigned int but the Murmur functions take an int length -- confirm sizes above INT_MAX cannot reach here.
#else
- const static unsigned long FNVOffsetBasis = 2166136261UL;
- const static unsigned long FNVHashPrime = 16777619UL;
+ hashval hv[1]; // MurmurHash3_x86_32 writes a single uint32_t here. NOTE(review): assumes hashval is exactly 32 bits in this branch -- confirm.
+ MurmurHash3_x86_32(bp, sz, seed, hv);
#endif
-
- static inline unsigned long update_hash(unsigned long hv,
- unsigned char byte)
- {
- return (hv ^ byte) * FNVHashPrime;
- }
-
- static inline unsigned long finish_hash(unsigned long hv) {
- return (hv>>FIXNUM_WIDTH) ^ (hv & FIXNUM_MAX);
- }
-
- hashval String::hash_str(const char *bp) {
- hashval hv;
-
- hv = FNVOffsetBasis;
-
- while(*bp) {
- hv = update_hash(hv, *bp++);
- }
-
- return finish_hash(hv);
- }
-
- hashval String::hash_str(const unsigned char *bp, unsigned int sz) {
- unsigned char* be = (unsigned char*)bp + sz;
- hashval hv = FNVOffsetBasis;
-
- while(bp < be) {
- hv = update_hash(hv, *bp++);
- }
-
- return finish_hash(hv);
+ return hv[0] & FIXNUM_MAX; // only the first output word is used; the mask keeps the bits covered by FIXNUM_MAX
}
Symbol* String::to_sym(STATE) {
View
9 vm/builtin/string.hpp
@@ -92,11 +92,12 @@ namespace rubinius {
static String* create_pinned(STATE, Fixnum* size);
static String* create_reserved(STATE, native_int bytes);
- // Hash the NUL-terminated string _bp_.
- static hashval hash_str(const char *bp);
-
// Hash the byte array _bp_ which contains _sz_ bytes.
- static hashval hash_str(const unsigned char *bp, unsigned int sz);
+ static hashval hash_str(const unsigned char *bp, unsigned int sz, uint32_t seed);
+ // Hash the byte array _bp_ of _sz_ bytes using the seed returned by state->hash_seed().
+ static hashval hash_str(STATE, const unsigned char *bp, unsigned int sz) {
+ return hash_str(bp, sz, state->hash_seed()); // forwards to the explicit-seed overload
+ }
// Rubinius.primitive :string_equal
Object* equal(STATE, String* other) {
View
2  vm/llvm/state.cpp
@@ -732,7 +732,7 @@ namespace rubinius {
}
Symbol* LLVMState::symbol(const std::string sym) {
- return symbols_.lookup(sym);
+ return symbols_.lookup(&shared_, sym); // shared_ is passed so the lookup can use its hash_seed (presumably the SymbolTable::lookup(SharedState*, std::string) overload -- confirm symbols_ type)
}
std::string LLVMState::symbol_debug_str(const Symbol* sym) {
View
5 vm/shared_state.cpp
@@ -15,6 +15,7 @@
#include "agent.hpp"
#include "world_state.hpp"
+#include "builtin/randomizer.hpp"
#ifdef ENABLE_LLVM
#include "llvm/state.hpp"
@@ -38,8 +39,8 @@ namespace rubinius {
, tool_broker_(new tooling::ToolBroker)
, ruby_critical_set_(false)
, check_gc_(false)
-
, om(0)
+
, global_cache(new GlobalCache)
, config(config)
, user_variables(cp)
@@ -50,6 +51,8 @@ namespace rubinius {
for(int i = 0; i < Primitives::cTotalPrimitives; i++) {
primitive_hits_[i] = 0;
}
+
+ hash_seed = Randomizer::random_uint32();
}
SharedState::~SharedState() {
View
1  vm/shared_state.hpp
@@ -102,6 +102,7 @@ namespace rubinius {
SymbolTable symbols;
LLVMState* llvm_state;
Stats stats;
+ uint32_t hash_seed;
public:
SharedState(Environment* env, Configuration& config, ConfigParser& cp);
View
4 vm/state.hpp
@@ -50,6 +50,10 @@ namespace rubinius {
return vm_->symbol(str);
}
+ uint32_t hash_seed() { // Returns the hash seed stored on the shared state (shared_.hash_seed).
+ return shared_.hash_seed;
+ }
+
template <class T>
T* new_object(Class *cls) {
return reinterpret_cast<T*>(vm_->new_object_typed(cls, sizeof(T), T::type));
View
12 vm/symboltable.cpp
@@ -62,7 +62,7 @@ namespace rubinius {
return NULL;
}
- return lookup(str, length);
+ return lookup(str, length, state->hash_seed());
}
struct SpecialOperator {
@@ -96,15 +96,15 @@ namespace rubinius {
return 0;
}
- Symbol* SymbolTable::lookup(std::string str) {
- return lookup(str.data(), str.size());
+ Symbol* SymbolTable::lookup(SharedState* shared, std::string str) { // Overload for callers that hold a SharedState instead of a STATE.
+ return lookup(str.data(), str.size(), shared->hash_seed); // seed is read directly from the SharedState member
}
Symbol* SymbolTable::lookup(STATE, std::string str) {
- return lookup(str.data(), str.size());
+ return lookup(str.data(), str.size(), state->hash_seed()); // forwards the string's bytes and length together with the seed from state
}
- Symbol* SymbolTable::lookup(const char* str, size_t length) {
+ Symbol* SymbolTable::lookup(const char* str, size_t length, uint32_t seed) {
size_t sym;
if(const char* op = find_special(str, length)) {
@@ -112,7 +112,7 @@ namespace rubinius {
length = strlen(str);
}
- hashval hash = String::hash_str((unsigned char*)str, length);
+ hashval hash = String::hash_str((unsigned char*)str, length, seed);
// Symbols can be looked up by multiple threads at the same time.
// This is a fast operation, so we protect it with a spinlock.
View
6 vm/symboltable.hpp
@@ -29,6 +29,7 @@ namespace rubinius {
class Array;
class String;
class Symbol;
+ class SharedState;
typedef std::vector<std::string> SymbolStrings;
typedef std::vector<std::size_t> SymbolIds;
@@ -58,14 +59,15 @@ namespace rubinius {
thread::SpinLock lock_;
size_t bytes_used_;
+ Symbol* lookup(const char* str, size_t length, uint32_t seed);
+
public:
size_t& bytes_used() {
return bytes_used_;
}
- Symbol* lookup(std::string str);
+ Symbol* lookup(SharedState* shared, std::string str);
Symbol* lookup(STATE, std::string str);
- Symbol* lookup(const char* str, size_t length);
Symbol* lookup(STATE, const char* str, size_t length);
Symbol* lookup(STATE, String* str);
String* lookup_string(STATE, const Symbol* sym);
View
4 vm/test/test_symboltable.hpp
@@ -74,8 +74,8 @@ class TestSymbolTable : public CxxTest::TestSuite, public VMTest {
const char* str = "__uint_fast64_t";
const char* str2 = "TkIF_MOD";
- TS_ASSERT_EQUALS(String::hash_str((unsigned char*)str, strlen(str)),
- String::hash_str((unsigned char*)str2, strlen(str2)));
+ TS_ASSERT_EQUALS(String::hash_str((unsigned char*)str, strlen(str), 0),
+ String::hash_str((unsigned char*)str2, strlen(str2), 0));
sym = symbols->lookup(state, std::string(str));
sym2 = symbols->lookup(state, std::string(str2));
View
334 vm/util/murmur_hash3.cpp
@@ -0,0 +1,334 @@
+//-----------------------------------------------------------------------------
+// MurmurHash3 was written by Austin Appleby, and is placed in the public
+// domain. The author hereby disclaims copyright to this source code.
+
+// Note - The x86 and x64 versions do _not_ produce the same results, as the
+// algorithms are optimized for their respective platforms. You can still
+// compile and run any of them on any platform, but your performance with the
+// non-native version will be less than optimal.
+
+#include "murmur_hash3.hpp"
+
+//-----------------------------------------------------------------------------
+// Platform-specific functions and macros
+
+// Microsoft Visual Studio
+
+#if defined(_MSC_VER)
+
+#define FORCE_INLINE __forceinline
+
+#include <stdlib.h>
+
+#define ROTL32(x,y) _rotl(x,y)
+#define ROTL64(x,y) _rotl64(x,y)
+
+#define BIG_CONSTANT(x) (x)
+
+// Other compilers
+
+#else // defined(_MSC_VER)
+
+#define FORCE_INLINE inline __attribute__((always_inline)) // GCC expects always_inline functions to also be declared inline; without it GCC warns that the function might not be inlinable
+
+inline uint32_t rotl32 ( uint32_t x, int8_t r ) // Rotate the 32-bit value x left by r bits.
+{
+ return (x << r) | (x >> (32 - r)); // NOTE(review): r == 0 would shift right by 32; every ROTL32 call in this file passes a non-zero constant
+}
+
+inline uint64_t rotl64 ( uint64_t x, int8_t r ) // Rotate the 64-bit value x left by r bits.
+{
+ return (x << r) | (x >> (64 - r)); // NOTE(review): r == 0 would shift right by 64; every ROTL64 call in this file passes a non-zero constant
+}
+
+#define ROTL32(x,y) rotl32(x,y)
+#define ROTL64(x,y) rotl64(x,y)
+
+#define BIG_CONSTANT(x) (x##LLU)
+
+#endif // !defined(_MSC_VER)
+
+//-----------------------------------------------------------------------------
+// Block read - if your platform needs to do endian-swapping or can only
+// handle aligned reads, do the conversion here
+
+FORCE_INLINE uint32_t getblock ( const uint32_t * p, int i ) // Load the 32-bit word at index i relative to p; the x86 variants call this with negative i.
+{
+ return p[i]; // plain indexed load. NOTE(review): callers obtain p by casting a byte pointer, so this may be an unaligned read -- confirm the target platforms tolerate that
+}
+
+FORCE_INLINE uint64_t getblock ( const uint64_t * p, int i ) // Load the 64-bit word at index i relative to p (used by the x64 variant).
+{
+ return p[i]; // plain indexed load. NOTE(review): callers obtain p by casting a byte pointer, so this may be an unaligned read -- confirm the target platforms tolerate that
+}
+
+//-----------------------------------------------------------------------------
+// Finalization mix - force all bits of a hash block to avalanche
+
+FORCE_INLINE uint32_t fmix ( uint32_t h ) // 32-bit finalization mix: xor-shift steps (16, 13, 16) alternating with two constant multiplies.
+{
+ h ^= h >> 16;
+ h *= 0x85ebca6b;
+ h ^= h >> 13;
+ h *= 0xc2b2ae35;
+ h ^= h >> 16;
+
+ return h;
+}
+
+//----------
+
+FORCE_INLINE uint64_t fmix ( uint64_t k ) // 64-bit finalization mix: three xor-shifts by 33 alternating with two constant multiplies.
+{
+ k ^= k >> 33;
+ k *= BIG_CONSTANT(0xff51afd7ed558ccd);
+ k ^= k >> 33;
+ k *= BIG_CONSTANT(0xc4ceb9fe1a85ec53);
+ k ^= k >> 33;
+
+ return k;
+}
+
+//-----------------------------------------------------------------------------
+/*
+ * MurmurHash3_x86_32: hash len bytes starting at key into one 32-bit value.
+ *
+ *   key  - start of the input bytes
+ *   len  - number of bytes to hash
+ *   seed - initial value of the hash state h1
+ *   out  - receives the result as a single uint32_t (4 bytes are written)
+ *
+ * Full 4-byte blocks are mixed first, then the 0-3 leftover bytes, then
+ * the length is XORed in and fmix() is applied.
+ */
+void MurmurHash3_x86_32 ( const void * key, int len,
+ uint32_t seed, void * out )
+{
+ const uint8_t * data = (const uint8_t*)key;
+ const int nblocks = len / 4;
+
+ uint32_t h1 = seed;
+
+ uint32_t c1 = 0xcc9e2d51;
+ uint32_t c2 = 0x1b873593;
+
+ //----------
+ // body
+
+ const uint32_t * blocks = (const uint32_t *)(data + nblocks*4); // points just past the last full block
+
+ for(int i = -nblocks; i; i++) // i runs from -nblocks up to -1, so blocks are read front to back via negative indices
+ {
+ uint32_t k1 = getblock(blocks,i);
+
+ k1 *= c1;
+ k1 = ROTL32(k1,15);
+ k1 *= c2;
+
+ h1 ^= k1;
+ h1 = ROTL32(h1,13);
+ h1 = h1*5+0xe6546b64;
+ }
+
+ //----------
+ // tail
+
+ const uint8_t * tail = (const uint8_t*)(data + nblocks*4);
+
+ uint32_t k1 = 0;
+
+ switch(len & 3) // cases fall through on purpose: each one adds one more leftover byte
+ {
+ case 3: k1 ^= tail[2] << 16;
+ case 2: k1 ^= tail[1] << 8;
+ case 1: k1 ^= tail[0];
+ k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1;
+ };
+
+ //----------
+ // finalization
+
+ h1 ^= len;
+
+ h1 = fmix(h1);
+
+ *(uint32_t*)out = h1; // out must have room for 4 bytes
+}
+
+//-----------------------------------------------------------------------------
+/*
+ * MurmurHash3_x86_128: hash len bytes starting at key into a 128-bit value
+ * using four 32-bit state words (h1..h4).
+ *
+ *   key  - start of the input bytes
+ *   len  - number of bytes to hash
+ *   seed - initial value of all four state words
+ *   out  - receives the result as four uint32_t values (16 bytes are written)
+ *
+ * Full 16-byte blocks are mixed first, then the 0-15 leftover bytes, then
+ * the length is XORed into every state word and the words are cross-added
+ * and passed through fmix().
+ */
+void MurmurHash3_x86_128 ( const void * key, const int len,
+ uint32_t seed, void * out )
+{
+ const uint8_t * data = (const uint8_t*)key;
+ const int nblocks = len / 16;
+
+ uint32_t h1 = seed;
+ uint32_t h2 = seed;
+ uint32_t h3 = seed;
+ uint32_t h4 = seed;
+
+ uint32_t c1 = 0x239b961b;
+ uint32_t c2 = 0xab0e9789;
+ uint32_t c3 = 0x38b34ae5;
+ uint32_t c4 = 0xa1e38b93;
+
+ //----------
+ // body
+
+ const uint32_t * blocks = (const uint32_t *)(data + nblocks*16); // points just past the last full block
+
+ for(int i = -nblocks; i; i++) // i runs from -nblocks up to -1; each block is four 32-bit words
+ {
+ uint32_t k1 = getblock(blocks,i*4+0);
+ uint32_t k2 = getblock(blocks,i*4+1);
+ uint32_t k3 = getblock(blocks,i*4+2);
+ uint32_t k4 = getblock(blocks,i*4+3);
+
+ k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1;
+
+ h1 = ROTL32(h1,19); h1 += h2; h1 = h1*5+0x561ccd1b;
+
+ k2 *= c2; k2 = ROTL32(k2,16); k2 *= c3; h2 ^= k2;
+
+ h2 = ROTL32(h2,17); h2 += h3; h2 = h2*5+0x0bcaa747;
+
+ k3 *= c3; k3 = ROTL32(k3,17); k3 *= c4; h3 ^= k3;
+
+ h3 = ROTL32(h3,15); h3 += h4; h3 = h3*5+0x96cd1c35;
+
+ k4 *= c4; k4 = ROTL32(k4,18); k4 *= c1; h4 ^= k4;
+
+ h4 = ROTL32(h4,13); h4 += h1; h4 = h4*5+0x32ac3b17;
+ }
+
+ //----------
+ // tail
+
+ const uint8_t * tail = (const uint8_t*)(data + nblocks*16);
+
+ uint32_t k1 = 0;
+ uint32_t k2 = 0;
+ uint32_t k3 = 0;
+ uint32_t k4 = 0;
+
+ switch(len & 15) // cases fall through on purpose: each one adds one more leftover byte
+ {
+ case 15: k4 ^= tail[14] << 16;
+ case 14: k4 ^= tail[13] << 8;
+ case 13: k4 ^= tail[12] << 0;
+ k4 *= c4; k4 = ROTL32(k4,18); k4 *= c1; h4 ^= k4;
+
+ case 12: k3 ^= tail[11] << 24; // NOTE(review): the byte is promoted to int before the shift by 24; consider widening to uint32_t first, as the x64 variant does with uint64_t
+ case 11: k3 ^= tail[10] << 16;
+ case 10: k3 ^= tail[ 9] << 8;
+ case 9: k3 ^= tail[ 8] << 0;
+ k3 *= c3; k3 = ROTL32(k3,17); k3 *= c4; h3 ^= k3;
+
+ case 8: k2 ^= tail[ 7] << 24;
+ case 7: k2 ^= tail[ 6] << 16;
+ case 6: k2 ^= tail[ 5] << 8;
+ case 5: k2 ^= tail[ 4] << 0;
+ k2 *= c2; k2 = ROTL32(k2,16); k2 *= c3; h2 ^= k2;
+
+ case 4: k1 ^= tail[ 3] << 24;
+ case 3: k1 ^= tail[ 2] << 16;
+ case 2: k1 ^= tail[ 1] << 8;
+ case 1: k1 ^= tail[ 0] << 0;
+ k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1;
+ };
+
+ //----------
+ // finalization
+
+ h1 ^= len; h2 ^= len; h3 ^= len; h4 ^= len;
+
+ h1 += h2; h1 += h3; h1 += h4;
+ h2 += h1; h3 += h1; h4 += h1;
+
+ h1 = fmix(h1);
+ h2 = fmix(h2);
+ h3 = fmix(h3);
+ h4 = fmix(h4);
+
+ h1 += h2; h1 += h3; h1 += h4;
+ h2 += h1; h3 += h1; h4 += h1;
+
+ ((uint32_t*)out)[0] = h1; // out must have room for four uint32_t values
+ ((uint32_t*)out)[1] = h2;
+ ((uint32_t*)out)[2] = h3;
+ ((uint32_t*)out)[3] = h4;
+}
+
+//-----------------------------------------------------------------------------
+/*
+ * MurmurHash3_x64_128: hash len bytes starting at key into a 128-bit value
+ * using two 64-bit state words (h1, h2).
+ *
+ *   key  - start of the input bytes
+ *   len  - number of bytes to hash
+ *   seed - initial value of both state words
+ *   out  - receives the result as two uint64_t values (16 bytes are written)
+ *
+ * Full 16-byte blocks are mixed first, then the 0-15 leftover bytes, then
+ * the length is XORed into both state words and they are cross-added and
+ * passed through fmix().
+ */
+void MurmurHash3_x64_128 ( const void * key, const int len,
+ const uint32_t seed, void * out )
+{
+ const uint8_t * data = (const uint8_t*)key;
+ const int nblocks = len / 16;
+
+ uint64_t h1 = seed;
+ uint64_t h2 = seed;
+
+ uint64_t c1 = BIG_CONSTANT(0x87c37b91114253d5);
+ uint64_t c2 = BIG_CONSTANT(0x4cf5ad432745937f);
+
+ //----------
+ // body
+
+ const uint64_t * blocks = (const uint64_t *)(data); // unlike the x86 variants, this one indexes forward from the start of the data
+
+ for(int i = 0; i < nblocks; i++) // each block is two 64-bit words
+ {
+ uint64_t k1 = getblock(blocks,i*2+0);
+ uint64_t k2 = getblock(blocks,i*2+1);
+
+ k1 *= c1; k1 = ROTL64(k1,31); k1 *= c2; h1 ^= k1;
+
+ h1 = ROTL64(h1,27); h1 += h2; h1 = h1*5+0x52dce729;
+
+ k2 *= c2; k2 = ROTL64(k2,33); k2 *= c1; h2 ^= k2;
+
+ h2 = ROTL64(h2,31); h2 += h1; h2 = h2*5+0x38495ab5;
+ }
+
+ //----------
+ // tail
+
+ const uint8_t * tail = (const uint8_t*)(data + nblocks*16);
+
+ uint64_t k1 = 0;
+ uint64_t k2 = 0;
+
+ switch(len & 15) // cases fall through on purpose: each one adds one more leftover byte
+ {
+ case 15: k2 ^= uint64_t(tail[14]) << 48; // bytes are widened to uint64_t before shifting
+ case 14: k2 ^= uint64_t(tail[13]) << 40;
+ case 13: k2 ^= uint64_t(tail[12]) << 32;
+ case 12: k2 ^= uint64_t(tail[11]) << 24;
+ case 11: k2 ^= uint64_t(tail[10]) << 16;
+ case 10: k2 ^= uint64_t(tail[ 9]) << 8;
+ case 9: k2 ^= uint64_t(tail[ 8]) << 0;
+ k2 *= c2; k2 = ROTL64(k2,33); k2 *= c1; h2 ^= k2;
+
+ case 8: k1 ^= uint64_t(tail[ 7]) << 56;
+ case 7: k1 ^= uint64_t(tail[ 6]) << 48;
+ case 6: k1 ^= uint64_t(tail[ 5]) << 40;
+ case 5: k1 ^= uint64_t(tail[ 4]) << 32;
+ case 4: k1 ^= uint64_t(tail[ 3]) << 24;
+ case 3: k1 ^= uint64_t(tail[ 2]) << 16;
+ case 2: k1 ^= uint64_t(tail[ 1]) << 8;
+ case 1: k1 ^= uint64_t(tail[ 0]) << 0;
+ k1 *= c1; k1 = ROTL64(k1,31); k1 *= c2; h1 ^= k1;
+ };
+
+ //----------
+ // finalization
+
+ h1 ^= len; h2 ^= len;
+
+ h1 += h2;
+ h2 += h1;
+
+ h1 = fmix(h1);
+ h2 = fmix(h2);
+
+ h1 += h2;
+ h2 += h1;
+
+ ((uint64_t*)out)[0] = h1; // out must have room for two uint64_t values
+ ((uint64_t*)out)[1] = h2;
+}
+
+//-----------------------------------------------------------------------------
View
37 vm/util/murmur_hash3.hpp
@@ -0,0 +1,37 @@
+//-----------------------------------------------------------------------------
+// MurmurHash3 was written by Austin Appleby, and is placed in the public
+// domain. The author hereby disclaims copyright to this source code.
+
+#ifndef _MURMURHASH3_H_
+#define _MURMURHASH3_H_
+
+//-----------------------------------------------------------------------------
+// Platform-specific functions and macros
+
+// Microsoft Visual Studio
+
+#if defined(_MSC_VER)
+
+typedef unsigned char uint8_t;
+typedef unsigned long uint32_t;
+typedef unsigned __int64 uint64_t;
+
+// Other compilers
+
+#else // defined(_MSC_VER)
+
+#include <stdint.h>
+
+#endif // !defined(_MSC_VER)
+
+//-----------------------------------------------------------------------------
+
+void MurmurHash3_x86_32 ( const void * key, int len, uint32_t seed, void * out ); // writes one uint32_t (4 bytes) to out
+
+void MurmurHash3_x86_128 ( const void * key, int len, uint32_t seed, void * out ); // writes four uint32_t values (16 bytes) to out
+
+void MurmurHash3_x64_128 ( const void * key, int len, uint32_t seed, void * out ); // writes two uint64_t values (16 bytes) to out
+
+//-----------------------------------------------------------------------------
+
+#endif // _MURMURHASH3_H_
View
3  vm/vm.cpp
@@ -237,7 +237,8 @@ namespace rubinius {
}
Symbol* VM::symbol(const char* str) {
- return shared.symbols.lookup(str, strlen(str));
+ State state(this); // temporary State built from this VM so the STATE-taking lookup overload can be called
+ return shared.symbols.lookup(&state, str, strlen(str)); // str is treated as NUL-terminated: its length comes from strlen
}
Symbol* VM::symbol(std::string str) {
Please sign in to comment.
Something went wrong with that request. Please try again.