Skip to content

Commit

Permalink
Refinements of "def seq" and murmurhash.
Browse files Browse the repository at this point in the history
Trying to make hashcodes faster. Didn't achieve much on that front, so
redirected into structural/consistency issues. The latter was lacking
in terms of how/where "def seq" was defined. The documentation I can
find doesn't give me much hint that the sequential form of my sequential
collection might be a single-use iterator! (As in StringOps, ArrayOps.)
If that's intentional it should be in huge letters. I'm assuming for now
that it wasn't.

Also, there was this:

  GenMapLike:     def seq: Map[A, B]
  GenSetLike:     def seq: Set[A]
  GenSeqLike:     // nothing, returns Traversable

So I added some def seqs where I needed the more specific types for
my hashcode work. Hashcodewise, I broke the MurmurHash3 object into
a reusable class and a collections-specific object, and I deprecated
the methods which took GenTraversableOnce in favor of ones taking
TraversableOnce, because there's no reason the hashcode library should
have to know about things like "make sure to call seq before you
traverse or you'll be sorry." Exclude things by their type and you can
never make a mistake. End transmission.
  • Loading branch information
paulp committed Nov 24, 2011
1 parent 4cfca8a commit 60fb9ec
Show file tree
Hide file tree
Showing 19 changed files with 103 additions and 57 deletions.
3 changes: 1 addition & 2 deletions src/library/scala/collection/GenMapLike.scala
Original file line number Diff line number Diff line change
Expand Up @@ -29,10 +29,9 @@ trait GenMapLike[A, +B, +Repr] extends GenIterableLike[(A, B), Repr] with Equals
def +[B1 >: B](kv: (A, B1)): GenMap[A, B1]
def - (key: A): Repr


// This hash code must be symmetric in the contents but ought not
// collide trivially.
override def hashCode() = util.MurmurHash3.symmetricHash(seq, Map.hashSeed)
override def hashCode() = util.MurmurHash3.mapHash(seq)

/** Returns the value associated with a key, or a default value if the key is not contained in the map.
* @param key the key.
Expand Down
12 changes: 2 additions & 10 deletions src/library/scala/collection/GenSeqLike.scala
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ import annotation.bridge
* Unlike iterables, sequences always have a defined order of elements.
*/
trait GenSeqLike[+A, +Repr] extends GenIterableLike[A, Repr] with Equals with Parallelizable[A, parallel.ParSeq[A]] {
def seq: Seq[A]

/** Selects an element by its index in the $coll.
*
Expand Down Expand Up @@ -439,16 +440,7 @@ trait GenSeqLike[+A, +Repr] extends GenIterableLike[A, Repr] with Equals with Pa
/** Hashcodes for $Coll produce a value from the hashcodes of all the
* elements of the $coll.
*/
override def hashCode() = {
import util.MurmurHash3._
var n = 0
var h = Seq.hashSeed
seq foreach {
x => h = mix(h, x.##)
n += 1
}
finalizeHash(h, n)
}
override def hashCode() = util.MurmurHash3.seqHash(seq)

/** The equals method for arbitrary sequences. Compares this sequence to
* some other object.
Expand Down
3 changes: 1 addition & 2 deletions src/library/scala/collection/GenSetLike.scala
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,5 @@ extends GenIterableLike[A, Repr]
// Calling map on a set drops duplicates: any hashcode collisions would
// then be dropped before they can be added.
// Hash should be symmetric in set entries, but without trivial collisions.
override def hashCode() = util.MurmurHash3.symmetricHash(seq, Set.hashSeed)

override def hashCode() = util.MurmurHash3.setHash(seq)
}
1 change: 1 addition & 0 deletions src/library/scala/collection/IndexedSeq.scala
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ trait IndexedSeq[+A] extends Seq[A]
with GenericTraversableTemplate[A, IndexedSeq]
with IndexedSeqLike[A, IndexedSeq[A]] {
override def companion: GenericCompanion[IndexedSeq] = IndexedSeq
override def seq: IndexedSeq[A] = this
}

/** $factoryInfo
Expand Down
6 changes: 3 additions & 3 deletions src/library/scala/collection/IndexedSeqLike.scala
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,6 @@
** |/ **
\* */



package scala.collection

import generic._
Expand Down Expand Up @@ -42,6 +40,9 @@ import scala.annotation.tailrec
trait IndexedSeqLike[+A, +Repr] extends SeqLike[A, Repr] {
self =>

def seq: IndexedSeq[A]
override def hashCode() = util.MurmurHash3.seqHash(seq) // TODO - can we get faster via "indexedSeqHash" ?

override protected[this] def thisCollection: IndexedSeq[A] = this.asInstanceOf[IndexedSeq[A]]
override protected[this] def toCollection(repr: Repr): IndexedSeq[A] = repr.asInstanceOf[IndexedSeq[A]]

Expand Down Expand Up @@ -96,4 +97,3 @@ trait IndexedSeqLike[+A, +Repr] extends SeqLike[A, Repr] {
result
}
}

1 change: 1 addition & 0 deletions src/library/scala/collection/LinearSeq.scala
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ trait LinearSeq[+A] extends Seq[A]
with GenericTraversableTemplate[A, LinearSeq]
with LinearSeqLike[A, LinearSeq[A]] {
override def companion: GenericCompanion[LinearSeq] = LinearSeq
override def seq: LinearSeq[A] = this
}

/** $factoryInfo
Expand Down
7 changes: 6 additions & 1 deletion src/library/scala/collection/LinearSeqLike.scala
Original file line number Diff line number Diff line change
Expand Up @@ -41,11 +41,16 @@ import scala.util.control.Breaks._
* @tparam A the element type of the $coll
* @tparam Repr the type of the actual $coll containing the elements.
*/
trait LinearSeqLike[+A, +Repr <: LinearSeqLike[A, Repr]] extends SeqLike[A, Repr] { self: Repr =>
trait LinearSeqLike[+A, +Repr <: LinearSeqLike[A, Repr]] extends SeqLike[A, Repr] {
self: Repr =>

override protected[this] def thisCollection: LinearSeq[A] = this.asInstanceOf[LinearSeq[A]]
override protected[this] def toCollection(repr: Repr): LinearSeq[A] = repr.asInstanceOf[LinearSeq[A]]

def seq: LinearSeq[A]

override def hashCode() = util.MurmurHash3.seqHash(seq) // TODO - can we get faster via "linearSeqHash" ?

override /*IterableLike*/
def iterator: Iterator[A] = new AbstractIterator[A] {
var these = self
Expand Down
3 changes: 0 additions & 3 deletions src/library/scala/collection/Map.scala
Original file line number Diff line number Diff line change
Expand Up @@ -37,9 +37,6 @@ trait Map[A, +B] extends Iterable[(A, B)] with GenMap[A, B] with MapLike[A, B, M
* @define coll map
*/
object Map extends MapFactory[Map] {

private[collection] val hashSeed = "Map".hashCode

def empty[A, B]: immutable.Map[A, B] = immutable.Map.empty

/** $mapCanBuildFromInfo */
Expand Down
5 changes: 0 additions & 5 deletions src/library/scala/collection/Seq.scala
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,6 @@
** |/ **
\* */



package scala.collection

import generic._
Expand All @@ -32,9 +30,6 @@ trait Seq[+A] extends PartialFunction[Int, A]
* @define Coll Seq
*/
object Seq extends SeqFactory[Seq] {

private[collection] val hashSeed = "Seq".hashCode

/** $genericCanBuildFromInfo */
implicit def canBuildFrom[A]: CanBuildFrom[Coll, A, Seq[A]] = ReusableCBF.asInstanceOf[GenericCanBuildFrom[A]]

Expand Down
2 changes: 0 additions & 2 deletions src/library/scala/collection/Set.scala
Original file line number Diff line number Diff line change
Expand Up @@ -38,8 +38,6 @@ trait Set[A] extends (A => Boolean)
* @define Coll Set
*/
object Set extends SetFactory[Set] {
private[collection] val hashSeed = "Set".hashCode

def newBuilder[A] = immutable.Set.newBuilder[A]
override def empty[A]: Set[A] = immutable.Set.empty[A]
implicit def canBuildFrom[A]: CanBuildFrom[Coll, A, Set[A]] = setCanBuildFrom[A]
Expand Down
1 change: 1 addition & 0 deletions src/library/scala/collection/immutable/IndexedSeq.scala
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ trait IndexedSeq[+A] extends Seq[A]
with IndexedSeqLike[A, IndexedSeq[A]] {
override def companion: GenericCompanion[IndexedSeq] = IndexedSeq
override def toIndexedSeq[B >: A]: IndexedSeq[B] = this
override def seq: IndexedSeq[A] = this
}

/** $factoryInfo
Expand Down
1 change: 1 addition & 0 deletions src/library/scala/collection/immutable/LinearSeq.scala
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ trait LinearSeq[+A] extends Seq[A]
with GenericTraversableTemplate[A, LinearSeq]
with LinearSeqLike[A, LinearSeq[A]] {
override def companion: GenericCompanion[LinearSeq] = LinearSeq
override def seq: LinearSeq[A] = this
}

/** $factoryInfo
Expand Down
2 changes: 0 additions & 2 deletions src/library/scala/collection/immutable/Set.scala
Original file line number Diff line number Diff line change
Expand Up @@ -46,8 +46,6 @@ object Set extends ImmutableSetFactory[Set] {
implicit def canBuildFrom[A]: CanBuildFrom[Coll, A, Set[A]] = setCanBuildFrom[A]
override def empty[A]: Set[A] = EmptySet.asInstanceOf[Set[A]]

private val hashSeed = "Set".hashCode

/** An optimized representation for immutable empty sets */
private object EmptySet extends AbstractSet[Any] with Set[Any] with Serializable {
override def size: Int = 0
Expand Down
2 changes: 1 addition & 1 deletion src/library/scala/collection/immutable/StringOps.scala
Original file line number Diff line number Diff line change
Expand Up @@ -48,5 +48,5 @@ final class StringOps(override val repr: String) extends StringLike[String] {
override def toString = repr
override def length = repr.length

def seq = this.iterator
def seq = new WrappedString(repr)
}
2 changes: 1 addition & 1 deletion src/library/scala/collection/mutable/ArrayOps.scala
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,7 @@ abstract class ArrayOps[T] extends ArrayLike[T, Array[T]] with CustomParalleliza
bb.result
}

def seq = this.iterator
def seq = thisCollection

}

Expand Down
1 change: 1 addition & 0 deletions src/library/scala/collection/mutable/IndexedSeq.scala
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ trait IndexedSeq[A] extends Seq[A]
with GenericTraversableTemplate[A, IndexedSeq]
with IndexedSeqLike[A, IndexedSeq[A]] {
override def companion: GenericCompanion[IndexedSeq] = IndexedSeq
override def seq: IndexedSeq[A] = this
}

/** $factoryInfo
Expand Down
1 change: 1 addition & 0 deletions src/library/scala/collection/mutable/LinearSeq.scala
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ trait LinearSeq[A] extends Seq[A]
with GenericTraversableTemplate[A, LinearSeq]
with LinearSeqLike[A, LinearSeq[A]] {
override def companion: GenericCompanion[LinearSeq] = LinearSeq
override def seq: LinearSeq[A] = this
}

/** $factoryInfo
Expand Down
105 changes: 81 additions & 24 deletions src/library/scala/util/MurmurHash3.scala
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@ package scala.util

import java.lang.Integer.{ rotateLeft => rotl }


/**
* An implementation of Austin Appleby's MurmurHash 3 algorithm
* (MurmurHash3_x86_32).
Expand All @@ -22,15 +21,7 @@ import java.lang.Integer.{ rotateLeft => rotl }
*
* @see http://code.google.com/p/smhasher
*/
object MurmurHash3 {

// Some arbitrary values used as hash seeds
final val arraySeed = 0x3c074a61
final val stringSeed = 0xf7ca7fd2
final val productSeed = 0xcafebabe
final val symmetricSeed = 0xb592f7ae
final val traversableSeed = 0xe73a8b15

class MurmurHash3 {
/** Mix in a block of data into an intermediate hash value. */
final def mix(hash: Int, data: Int): Int = {
var h = mixLast(hash, data)
Expand Down Expand Up @@ -68,7 +59,7 @@ object MurmurHash3 {
}

/** Compute the hash of a product */
final def productHash(x: Product, seed: Int = productSeed): Int = {
final def productHash(x: Product, seed: Int): Int = {
val arr = x.productArity
// Case objects have the hashCode inlined directly into the
// synthetic hashCode method, but this method should still give
Expand All @@ -88,7 +79,7 @@ object MurmurHash3 {
}

/** Compute the hash of a string */
final def stringHash(str: String, seed: Int = stringSeed): Int = {
final def stringHash(str: String, seed: Int): Int = {
var h = seed
var i = 0
while (i + 1 < str.length) {
Expand All @@ -102,38 +93,39 @@ object MurmurHash3 {

/** Compute a hash that is symmetric in its arguments - that is a hash
* where the order of appearance of elements does not matter.
* This is useful for hashing sets, for example. */
final def symmetricHash[T](xs: collection.GenTraversableOnce[T], seed: Int = symmetricSeed): Int = {
* This is useful for hashing sets, for example.
*/
final def unorderedHash(xs: TraversableOnce[Any], seed: Int): Int = {
var a, b, n = 0
var c = 1
xs.seq.foreach { x =>
xs foreach { x =>
val h = x.##
a += h
b ^= h
if (h != 0) c *= h
n += 1
}

var h = seed
h = mix(h, a)
h = mix(h, b)
h = mixLast(h, c)
finalizeHash(h, n)
}

/** Compute a hash for a traversable (once). */
final def traversableHash[T](xs: collection.GenTraversableOnce[T], seed: Int = traversableSeed): Int = {
/** Compute a hash that depends on the order of its arguments.
*/
final def orderedHash(xs: TraversableOnce[Any], seed: Int): Int = {
var n = 0
var h = seed
xs.seq.foreach { x =>
xs foreach { x =>
h = mix(h, x.##)
n += 1
}
finalizeHash(h, n)
}

/** Compute the hash of an array */
final def arrayHash[@specialized T](a: Array[T], seed: Int = arraySeed): Int = {
/** Compute the hash of an array.
*/
final def arrayHash[@specialized T](a: Array[T], seed: Int): Int = {
var h = seed
var i = 0
while (i < a.length) {
Expand All @@ -144,8 +136,9 @@ object MurmurHash3 {
}

/** Compute the hash of a byte array. Faster than arrayHash, because
* it hashes 4 bytes at once. */
final def bytesHash(data: Array[Byte], seed: Int = arraySeed): Int = {
* it hashes 4 bytes at once.
*/
final def bytesHash(data: Array[Byte], seed: Int): Int = {
var len = data.length
var h = seed

Expand Down Expand Up @@ -176,3 +169,67 @@ object MurmurHash3 {
finalizeHash(h, data.length)
}
}

/**
* An instance of MurmurHash3 with predefined seeds for various
* classes. Used by all the scala collections and case classes.
*/
object MurmurHash3 extends MurmurHash3 {
final val arraySeed = 0x3c074a61
final val stringSeed = 0xf7ca7fd2
final val productSeed = 0xcafebabe
final val symmetricSeed = 0xb592f7ae
final val traversableSeed = 0xe73a8b15
final val seqSeed = "Seq".hashCode
final val mapSeed = "Map".hashCode
final val setSeed = "Set".hashCode

def arrayHash[@specialized T](a: Array[T]): Int = arrayHash(a, arraySeed)
def bytesHash(data: Array[Byte]): Int = bytesHash(data, arraySeed)
def orderedHash(xs: TraversableOnce[Any]): Int = orderedHash(xs, symmetricSeed)
def productHash(x: Product): Int = productHash(x, productSeed)
def stringHash(x: String): Int = stringHash(x, stringSeed)
def unorderedHash(xs: TraversableOnce[Any]): Int = unorderedHash(xs, traversableSeed)

/** To offer some potential for optimization.
*/
def seqHash(xs: collection.Seq[_]): Int = orderedHash(xs, seqSeed)
def mapHash(xs: collection.Map[_, _]): Int = unorderedHash(xs, mapSeed)
def setHash(xs: collection.Set[_]): Int = unorderedHash(xs, setSeed)

/** All this trouble and foreach still appears faster.
* Leaving in place in case someone would like to investigate further.
*/
/**
def linearSeqHash(xs: collection.LinearSeq[_], seed: Int): Int = {
var n = 0
var h = seed
var elems = xs
while (elems.nonEmpty) {
h = mix(h, elems.head.##)
n += 1
elems = elems.tail
}
finalizeHash(h, n)
}
def indexedSeqHash(xs: collection.IndexedSeq[_], seed: Int): Int = {
var n = 0
var h = seed
val len = xs.length
while (n < len) {
h = mix(h, xs(n).##)
n += 1
}
finalizeHash(h, n)
}
*/

@deprecated("Use unorderedHash", "2.10.0")
final def symmetricHash[T](xs: collection.GenTraversableOnce[T], seed: Int = symmetricSeed): Int =
unorderedHash(xs.seq, seed)

@deprecated("Use orderedHash", "2.10.0")
final def traversableHash[T](xs: collection.GenTraversableOnce[T], seed: Int = traversableSeed): Int =
orderedHash(xs.seq, seed)
}
2 changes: 1 addition & 1 deletion src/library/scala/xml/Utility.scala
Original file line number Diff line number Diff line change
Expand Up @@ -259,7 +259,7 @@ object Utility extends AnyRef with parsing.TokenTests {
* @param children
*/
def hashCode(pre: String, label: String, attribHashCode: Int, scpeHash: Int, children: Seq[Node]) =
scala.util.MurmurHash3.traversableHash(label +: attribHashCode +: scpeHash +: children, pre.##)
scala.util.MurmurHash3.orderedHash(label +: attribHashCode +: scpeHash +: children, pre.##)

def appendQuoted(s: String): String = sbToString(appendQuoted(s, _))

Expand Down

0 comments on commit 60fb9ec

Please sign in to comment.