From 034f6b9452265dcd0e7493ed29971c8864b95c35 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=BCdiger=20Klaehn?= Date: Thu, 16 Jan 2014 21:39:56 +0100 Subject: [PATCH] SI-6253 HashSet should implement union Implements of HashSet.union that reuses the two trees as much as possible when calculating the union of two sets. This leads to significant performance improvements as well as to much better structural sharing. There is a comprehensive correctness test for union since there was not a single test for HashSet.union before. In addition, there are some tests of the desirable properties of the new implementation (structural sharing and efficiency regarding calls of key.hashCode). The other operations diff and intersect, which are conceptually very similar to union, are also implemented along with comprehensive test cases for both correctness and structural sharing. Note that while it appears that there is some code duplication between the three methods, they are sufficiently different that it is not possible to merge them into one without sacrificing performance. --- .../scala/collection/immutable/HashSet.scala | 525 +++++++++++++++++- test/files/run/t6253a.scala | 64 +++ test/files/run/t6253b.scala | 62 +++ test/files/run/t6253c.scala | 63 +++ 4 files changed, 700 insertions(+), 14 deletions(-) create mode 100644 test/files/run/t6253a.scala create mode 100644 test/files/run/t6253b.scala create mode 100644 test/files/run/t6253c.scala diff --git a/src/library/scala/collection/immutable/HashSet.scala b/src/library/scala/collection/immutable/HashSet.scala index 127b22185a02..67e9d18da7bd 100644 --- a/src/library/scala/collection/immutable/HashSet.scala +++ b/src/library/scala/collection/immutable/HashSet.scala @@ -15,6 +15,7 @@ package immutable import generic._ import scala.collection.parallel.immutable.ParHashSet import scala.collection.GenSet +import scala.annotation.tailrec /** This class implements immutable sets using a hash trie. * @@ -38,7 +39,7 @@ class HashSet[A] extends AbstractSet[A] with CustomParallelizable[A, ParHashSet[A]] with Serializable { - import HashSet.{nullToEmpty, bufferSize} + import HashSet.{nullToEmpty, bufferSize, LeafHashSet} override def companion: GenericCompanion[HashSet] = HashSet @@ -85,8 +86,81 @@ class HashSet[A] extends AbstractSet[A] override def + (elem1: A, elem2: A, elems: A*): HashSet[A] = this + elem1 + elem2 ++ elems + override def union(that: GenSet[A]): HashSet[A] = that match { + case that: HashSet[A] => + val buffer = new Array[HashSet[A]](bufferSize(this.size + that.size)) + nullToEmpty(union0(that, 0, buffer, 0)) + case _ => super.union(that) + } + + override def intersect(that: GenSet[A]): HashSet[A] = that match { + case that: HashSet[A] => + val buffer = new Array[HashSet[A]](bufferSize(this.size min that.size)) + nullToEmpty(intersect0(that, 0, buffer, 0)) + case _ => super.intersect(that) + } + + override def diff(that: GenSet[A]): HashSet[A] = that match { + case that: HashSet[A] => + val buffer = new Array[HashSet[A]](bufferSize(this.size)) + nullToEmpty(diff0(that, 0, buffer, 0)) + case _ => super.diff(that) + } + + /** + * Union with a leaf HashSet at a given level. + * @param that a leaf HashSet + * @param level the depth in the tree. We need this when we have to create a branch node on top of this and that + * @return The union of this and that at the given level. Unless level is zero, the result is not a self-contained + * HashSet but needs to be stored at the correct depth + */ + private[immutable] def union0(that: LeafHashSet[A], level: Int): HashSet[A] = { + // the default implementation is for the empty set, so we just return that + that + } + + /** + * Union with a HashSet at a given level + * @param that a HashSet + * @param level the depth in the tree. We need to keep track of the level to know how deep we are in the tree + * @param buffer a temporary buffer that is used for temporarily storing elements when creating new branch nodes + * @param offset0 the first offset into the buffer in which we are allowed to write + * @return The union of this and that at the given level. Unless level is zero, the result is not a self-contained + * HashSet but needs to be stored at the correct depth + */ + private[immutable] def union0(that: HashSet[A], level: Int, buffer: Array[HashSet[A]], offset0: Int): HashSet[A] = { + // the default implementation is for the empty set, so we just return that + that + } + + /** + * Intersection with another hash set at a given level + * @param level the depth in the tree. We need to keep track of the level to know how deep we are in the tree + * @param buffer a temporary buffer that is used for temporarily storing elements when creating new branch nodes + * @param offset0 the first offset into the buffer in which we are allowed to write + * @return The intersection of this and that at the given level. Unless level is zero, the result is not a + * self-contained HashSet but needs to be stored at the correct depth + */ + private[immutable] def intersect0(that: HashSet[A], level: Int, buffer: Array[HashSet[A]], offset0: Int): HashSet[A] = { + // the default implementation is for the empty set, so we just return the empty set + null + } + + /** + * Diff with another hash set at a given level + * @param level the depth in the tree. We need to keep track of the level to know how deep we are in the tree + * @param buffer a temporary buffer that is used for temporarily storing elements when creating new branch nodes + * @param offset0 the first offset into the buffer in which we are allowed to write + * @return The diff of this and that at the given level. Unless level is zero, the result is not a + * self-contained HashSet but needs to be stored at the correct depth + */ + private[immutable] def diff0(that: HashSet[A], level: Int, buffer: Array[HashSet[A]], offset0: Int): HashSet[A] = { + // the default implementation is for the empty set, so we just return the empty set + null + } + def - (e: A): HashSet[A] = - removed0(e, computeHash(e), 0) + nullToEmpty(removed0(e, computeHash(e), 0)) override def filter(p: A => Boolean) = { val buffer = new Array[HashSet[A]](bufferSize(size)) @@ -165,7 +239,14 @@ object HashSet extends ImmutableSetFactory[HashSet] { } } - class HashSet1[A](private[HashSet] val key: A, private[HashSet] val hash: Int) extends HashSet[A] { + /** + * Common superclass of HashSet1 and HashSetCollision1, which are the two possible leaves of the Trie + */ + private[HashSet] sealed abstract class LeafHashSet[A] extends HashSet[A] { + private[HashSet] def hash:Int + } + + class HashSet1[A](private[HashSet] val key: A, private[HashSet] val hash: Int) extends LeafHashSet[A] { override def size = 1 override def get0(key: A, hash: Int, level: Int): Boolean = @@ -190,8 +271,42 @@ object HashSet extends ImmutableSetFactory[HashSet] { } } + override private[immutable] def union0(that: LeafHashSet[A], level: Int): HashSet[A] = that match { + case that if that.hash != this.hash => + // different hash code, so there is no need to investigate further. + // Just create a branch node containing the two. + makeHashTrieSet(this.hash, this, that.hash, that, level) + case that: HashSet1[A] => + if (this.key == that.key) { + this + } else { + // 32-bit hash collision (rare, but not impossible) + new HashSetCollision1[A](hash, ListSet.empty + this.key + that.key) + } + case that: HashSetCollision1[A] => + val ks1 = that.ks + key + // Could use eq check (faster) if ListSet was guaranteed to return itself + if (ks1.size == that.ks.size) { + that + } else { + new HashSetCollision1[A](hash, ks1) + } + } + + override private[immutable] def union0(that: HashSet[A], level: Int, buffer: Array[HashSet[A]], offset0: Int) = { + // switch to the Leaf version of union + // we can exchange the arguments because union is symmetrical + that.union0(this, level) + } + + override private[immutable] def intersect0(that: HashSet[A], level: Int, buffer: Array[HashSet[A]], offset0: Int): HashSet[A] = + if (that.get0(key, hash, level)) this else null + + override private[immutable] def diff0(that: HashSet[A], level: Int, buffer: Array[HashSet[A]], offset0: Int): HashSet[A] = + if (that.get0(key, hash, level)) null else this + override def removed0(key: A, hash: Int, level: Int): HashSet[A] = - if (hash == this.hash && key == this.key) HashSet.empty[A] else this + if (hash == this.hash && key == this.key) null else this override protected def filter0(p: A => Boolean, negate: Boolean, level: Int, buffer: Array[HashSet[A]], offset0: Int): HashSet[A] = if (negate ^ p(key)) this else null @@ -200,8 +315,7 @@ object HashSet extends ImmutableSetFactory[HashSet] { override def foreach[U](f: A => U): Unit = f(key) } - private[immutable] class HashSetCollision1[A](private[HashSet] val hash: Int, val ks: ListSet[A]) - extends HashSet[A] { + private[immutable] class HashSetCollision1[A](private[HashSet] val hash: Int, val ks: ListSet[A]) extends LeafHashSet[A] { override def size = ks.size @@ -220,15 +334,116 @@ object HashSet extends ImmutableSetFactory[HashSet] { if (hash == this.hash) new HashSetCollision1(hash, ks + key) else makeHashTrieSet(this.hash, this, hash, new HashSet1(key, hash), level) + override def union0(that: LeafHashSet[A], level: Int): HashSet[A] = that match { + case that if that.hash != this.hash => + // different hash code, so there is no need to investigate further. + // Just create a branch node containing the two. + makeHashTrieSet(this.hash, this, that.hash, that, level) + case that: HashSet1[A] => + val ks1 = ks + that.key + // Could use eq check (faster) if ListSet was guaranteed to return itself + if (ks1.size == ks.size) { + this + } else { + // create a new HashSetCollision with the existing hash + // we don't have to check for size=1 because union is never going to remove elements + new HashSetCollision1[A](hash, ks1) + } + case that: HashSetCollision1[A] => + val ks1 = this.ks ++ that.ks + ks1.size match { + case size if size == this.ks.size => + // could this check be made faster by doing an eq check? + // I am not sure we can rely on ListSet returning itself when all elements are already in the set, + // so it seems unwise to rely on it. + this + case size if size == that.ks.size => + // we have to check this as well, since we don't want to create a new instance if this is a subset of that + that + case _ => + // create a new HashSetCollision with the existing hash + // we don't have to check for size=1 because union is never going to remove elements + new HashSetCollision1[A](hash, ks1) + } + } + + override def union0(that: HashSet[A], level: Int, buffer: Array[HashSet[A]], offset0: Int): HashSet[A] = that match { + case that: LeafHashSet[A] => + // switch to the simpler Tree/Leaf implementation + this.union0(that, level) + case that: HashTrieSet[A] => + // switch to the simpler Tree/Leaf implementation + // we can swap this and that because union is symmetrical + that.union0(this, level) + case _ => this + } + + override private[immutable] def intersect0(that: HashSet[A], level: Int, buffer: Array[HashSet[A]], offset0: Int): HashSet[A] = { + // filter the keys, taking advantage of the fact that we know their hash code + val ks1 = ks.filter(that.get0(_, hash, level)) + ks1.size match { + case 0 => + // the empty set + null + case size if size == this.size => + // unchanged + // We do this check first since even if the result is of size 1 since + // it is preferable to return the existing set for better structural sharing + this + case size if size == that.size => + // the other set + // We do this check first since even if the result is of size 1 since + // it is preferable to return the existing set for better structural sharing + that + case 1 => + // create a new HashSet1 with the hash we already know + new HashSet1(ks1.head, hash) + case _ => + // create a new HashSetCollison with the hash we already know and the new keys + new HashSetCollision1(hash, ks1) + } + } + + override private[immutable] def diff0(that: HashSet[A], level: Int, buffer: Array[HashSet[A]], offset0: Int): HashSet[A] = { + val ks1 = ks.filterNot(that.get0(_, hash, level)) + ks1.size match { + case 0 => + // the empty set + null + case size if size == this.size => + // unchanged + // We do this check first since even if the result is of size 1 since + // it is preferable to return the existing set for better structural sharing + this + case 1 => + // create a new HashSet1 with the hash we already know + new HashSet1(ks1.head, hash) + case _ => + // create a new HashSetCollison with the hash we already know and the new keys + new HashSetCollision1(hash, ks1) + } + } + override def removed0(key: A, hash: Int, level: Int): HashSet[A] = if (hash == this.hash) { val ks1 = ks - key - if(ks1.isEmpty) - HashSet.empty[A] - else if(ks1.tail.isEmpty) - new HashSet1(ks1.head, hash) - else - new HashSetCollision1(hash, ks1) + ks1.size match { + case 0 => + // the empty set + null + case size if size == ks.size => + // We do this check first since even if the result is of size 1 since + // it is preferable to return the existing set for better structural sharing + // we can not rely on ks.- returning the same instance if we subtract an element that is not in it + // so we need to do the size check + this + case 1 => + // create a new HashSet1 with the hash we already know + new HashSet1(ks1.head, hash) + case _ => + // create a new HashSetCollison with the hash we already know and the new keys + new HashSetCollision1(hash, ks1) + } } else this override protected def filter0(p: A => Boolean, negate: Boolean, level: Int, buffer: Array[HashSet[A]], offset0: Int): HashSet[A] = { @@ -345,6 +560,284 @@ object HashSet extends ImmutableSetFactory[HashSet] { } } + override private[immutable] def union0(that: LeafHashSet[A], level: Int): HashSet[A] = { + val index = (that.hash >>> level) & 0x1f + val mask = (1 << index) + val offset = Integer.bitCount(bitmap & (mask - 1)) + if ((bitmap & mask) != 0) { + val sub = elems(offset) + val sub1 = sub.union0(that, level + 5) + if (sub eq sub1) this + else { + val elems1 = new Array[HashSet[A]](elems.length) + Array.copy(elems, 0, elems1, 0, elems.length) + elems1(offset) = sub1 + new HashTrieSet(bitmap, elems1, size + (sub1.size - sub.size)) + } + } else { + val elems1 = new Array[HashSet[A]](elems.length + 1) + Array.copy(elems, 0, elems1, 0, offset) + elems1(offset) = that + Array.copy(elems, offset, elems1, offset + 1, elems.length - offset) + val bitmap1 = bitmap | mask + new HashTrieSet(bitmap1, elems1, size + that.size) + } + } + + override private[immutable] def union0(that: HashSet[A], level: Int, buffer: Array[HashSet[A]], offset0: Int): HashSet[A] = that match { + case that if that eq this => + // shortcut for when that is this + // this happens often for nodes deeper in the tree, especially when that and this share a common "heritage" + // e.g. you have a large set A and do some small operations (adding and removing elements) to it to create B + // then A and B will have the vast majority of nodes in common, and this eq check will allow not even looking + // at these nodes. + this + case that: LeafHashSet[A] => + // when that is a leaf, we can switch to the simpler Tree/Leaf implementation + this.union0(that, level) + case that: HashTrieSet[A] => + val a = this.elems + var abm = this.bitmap + var ai = 0 + + val b = that.elems + var bbm = that.bitmap + var bi = 0 + + // fetch a new temporary array that is guaranteed to be big enough (32 elements) + var offset = offset0 + var rs = 0 + + // loop as long as there are bits left in either abm or bbm + while ((abm | bbm) != 0) { + // lowest remaining bit in abm + val alsb = abm ^ (abm & (abm - 1)) + // lowest remaining bit in bbm + val blsb = bbm ^ (bbm & (bbm - 1)) + if (alsb == blsb) { + val sub1 = a(ai).union0(b(bi), level + 5, buffer, offset) + rs += sub1.size + buffer(offset) = sub1 + offset += 1 + // clear lowest remaining one bit in abm and increase the a index + abm &= ~alsb + ai += 1 + // clear lowest remaining one bit in bbm and increase the b index + bbm &= ~blsb + bi += 1 + } else if (unsignedCompare(alsb - 1, blsb - 1)) { + // alsb is smaller than blsb, or alsb is set and blsb is 0 + // in any case, alsb is guaranteed to be set here! + val sub1 = a(ai) + rs += sub1.size + buffer(offset) = sub1 + offset += 1 + // clear lowest remaining one bit in abm and increase the a index + abm &= ~alsb + ai += 1 + } else { + // blsb is smaller than alsb, or blsb is set and alsb is 0 + // in any case, blsb is guaranteed to be set here! + val sub1 = b(bi) + rs += sub1.size + buffer(offset) = sub1 + offset += 1 + // clear lowest remaining one bit in bbm and increase the b index + bbm &= ~blsb + bi += 1 + } + } + if (rs == this.size) { + // if the result would be identical to this, we might as well return this + this + } else if (rs == that.size) { + // if the result would be identical to that, we might as well return that + that + } else { + // we don't have to check whether the result is a leaf, since union will only make the set larger + // and this is not a leaf to begin with. + val length = offset - offset0 + val elems = new Array[HashSet[A]](length) + System.arraycopy(buffer, offset0, elems, 0, length) + new HashTrieSet(this.bitmap | that.bitmap, elems, rs) + } + case _ => this + } + + override private[immutable] def intersect0(that: HashSet[A], level: Int, buffer: Array[HashSet[A]], offset0: Int): HashSet[A] = that match { + case that if that eq this => + // shortcut for when that is this + // this happens often for nodes deeper in the tree, especially when that and this share a common "heritage" + // e.g. you have a large set A and do some small operations (adding and removing elements) to it to create B + // then A and B will have the vast majority of nodes in common, and this eq check will allow not even looking + // at these nodes! + this + case that: LeafHashSet[A] => + // when that is a leaf, we can switch to the simpler Tree/Leaf implementation + // it is OK to swap the arguments because intersect is symmetric + // (we can't do this in case of diff, which is not symmetric) + that.intersect0(this, level, buffer, offset0) + case that: HashTrieSet[A] => + val a = this.elems + var abm = this.bitmap + var ai = 0 + + val b = that.elems + var bbm = that.bitmap + var bi = 0 + + // if the bitmasks do not overlap, the result is definitely empty so we can abort here + if ((abm & bbm) == 0) + return null + + // fetch a new temporary array that is guaranteed to be big enough (32 elements) + var offset = offset0 + var rs = 0 + var rbm = 0 + + // loop as long as there are bits left that are set in both abm and bbm + while ((abm & bbm) != 0) { + // highest remaining bit in abm + val alsb = abm ^ (abm & (abm - 1)) + // highest remaining bit in bbm + val blsb = bbm ^ (bbm & (bbm - 1)) + if (alsb == blsb) { + val sub1 = a(ai).intersect0(b(bi), level + 5, buffer, offset) + if (sub1 ne null) { + rs += sub1.size + rbm |= alsb + buffer(offset) = sub1 + offset += 1 + } + // clear lowest remaining one bit in abm and increase the a index + abm &= ~alsb; + ai += 1 + // clear lowest remaining one bit in bbm and increase the b index + bbm &= ~blsb; + bi += 1 + } else if (unsignedCompare(alsb - 1, blsb - 1)) { + // alsb is smaller than blsb, or alsb is set and blsb is 0 + // in any case, alsb is guaranteed to be set here! + // clear lowest remaining one bit in abm and increase the a index + abm &= ~alsb; + ai += 1 + } else { + // blsb is smaller than alsb, or blsb is set and alsb is 0 + // in any case, blsb is guaranteed to be set here! + // clear lowest remaining one bit in bbm and increase the b index + bbm &= ~blsb; + bi += 1 + } + } + + if (rbm == 0) { + // if the result bitmap is empty, the result is the empty set + null + } else if (rs == size0) { + // if the result has the same number of elements as this, it must be identical to this, + // so we might as well return this + this + } else if (rs == that.size0) { + // if the result has the same number of elements as that, it must be identical to that, + // so we might as well return that + that + } else { + val length = offset - offset0 + if (length == 1 && !buffer(offset0).isInstanceOf[HashTrieSet[A]]) + buffer(offset0) + else { + val elems = new Array[HashSet[A]](length) + System.arraycopy(buffer, offset0, elems, 0, length) + new HashTrieSet[A](rbm, elems, rs) + } + } + case _ => null + } + + override private[immutable] def diff0(that: HashSet[A], level: Int, buffer: Array[HashSet[A]], offset0: Int): HashSet[A] = that match { + case that if that eq this => + // shortcut for when that is this + // this happens often for nodes deeper in the tree, especially when that and this share a common "heritage" + // e.g. you have a large set A and do some small operations (adding and removing elements) to it to create B + // then A and B will have the vast majority of nodes in common, and this eq check will allow not even looking + // at these nodes! + null + case that: HashSet1[A] => + removed0(that.key, that.hash, level) + case that: HashTrieSet[A] => + val a = this.elems + var abm = this.bitmap + var ai = 0 + + val b = that.elems + var bbm = that.bitmap + var bi = 0 + + // fetch a new temporary array that is guaranteed to be big enough (32 elements) + var offset = offset0 + var rs = 0 + var rbm = 0 + + // loop until there are no more bits in abm + while(abm!=0) { + // highest remaining bit in abm + val alsb = abm ^ (abm & (abm - 1)) + // highest remaining bit in bbm + val blsb = bbm ^ (bbm & (bbm - 1)) + if (alsb == blsb) { + val sub1 = a(ai).diff0(b(bi), level + 5, buffer, offset) + if (sub1 ne null) { + rs += sub1.size + rbm |= alsb + buffer(offset) = sub1 + offset += 1 + } + // clear lowest remaining one bit in abm and increase the a index + abm &= ~alsb; ai += 1 + // clear lowest remaining one bit in bbm and increase the b index + bbm &= ~blsb; bi += 1 + } else if (unsignedCompare(alsb - 1, blsb - 1)) { + // alsb is smaller than blsb, or alsb is set and blsb is 0 + // in any case, alsb is guaranteed to be set here! + val sub1 = a(ai) + rs += sub1.size + rbm |= alsb + buffer(offset) = sub1; offset += 1 + // clear lowest remaining one bit in abm and increase the a index + abm &= ~alsb; ai += 1 + } else { + // blsb is smaller than alsb, or blsb is set and alsb is 0 + // in any case, blsb is guaranteed to be set here! + // clear lowest remaining one bit in bbm and increase the b index + bbm &= ~blsb; bi += 1 + } + } + if (rbm == 0) { + null + } else if (rs == this.size0) { + // if the result has the same number of elements as this, it must be identical to this, + // so we might as well return this + this + } else { + val length = offset - offset0 + if (length == 1 && !buffer(offset0).isInstanceOf[HashTrieSet[A]]) + buffer(offset0) + else { + val elems = new Array[HashSet[A]](length) + System.arraycopy(buffer, offset0, elems, 0, length) + new HashTrieSet[A](rbm, elems, rs) + } + } + case that: HashSetCollision1[A] => + // we remove the elements using removed0 so we can use the fact that we know the hash of all elements + // to be removed + @tailrec def removeAll(s:HashSet[A], r:ListSet[A]) : HashSet[A] = + if(r.isEmpty || (s eq null)) s + else removeAll(s.removed0(r.head, that.hash, level), r.tail) + removeAll(this, that.ks) + case _ => this + } + override def removed0(key: A, hash: Int, level: Int): HashSet[A] = { val index = (hash >>> level) & 0x1f val mask = (1 << index) @@ -353,7 +846,7 @@ object HashSet extends ImmutableSetFactory[HashSet] { val sub = elems(offset) val subNew = sub.removed0(key, hash, level + 5) if (sub eq subNew) this - else if (subNew.isEmpty) { + else if (subNew eq null) { val bitmapNew = bitmap ^ mask if (bitmapNew != 0) { val elemsNew = new Array[HashSet[A]](elems.length - 1) @@ -367,7 +860,7 @@ object HashSet extends ImmutableSetFactory[HashSet] { else new HashTrieSet(bitmapNew, elemsNew, sizeNew) } else - HashSet.empty[A] + null } else { val elemsNew = new Array[HashSet[A]](elems.length) Array.copy(elems, 0, elemsNew, 0, elems.length) @@ -526,6 +1019,10 @@ object HashSet extends ImmutableSetFactory[HashSet] { result } + // unsigned comparison + @inline private[this] def unsignedCompare(i: Int, j: Int) = + (i < j) ^ (i < 0) ^ (j < 0) + @SerialVersionUID(2L) private class SerializationProxy[A,B](@transient private var orig: HashSet[A]) extends Serializable { private def writeObject(out: java.io.ObjectOutputStream) { val s = orig.size diff --git a/test/files/run/t6253a.scala b/test/files/run/t6253a.scala new file mode 100644 index 000000000000..efa3230df684 --- /dev/null +++ b/test/files/run/t6253a.scala @@ -0,0 +1,64 @@ +import scala.collection.immutable.HashSet + +object Test extends App { + + var hashCount = 0 + + /** + * A key that produces lots of hash collisions, to exercise the part of the code that deals with those + */ + case class Collision(value: Int) { + + override def hashCode = { + // we do not check hash counts for Collision keys because ListSet.++ uses a mutable hashset internally, + // so when we have hash collisions, union will call key.hashCode. + // hashCount += 1 + value / 5 + } + } + + /** + * A key that is identical to int other than that it counts hashCode invocations + */ + case class HashCounter(value: Int) { + + override def hashCode = { + hashCount += 1 + value + } + } + + def testUnion[T](sizes: Seq[Int], offsets: Seq[Double], keyType: String, mkKey: Int => T) { + for { + i <- sizes + o <- offsets + } { + val e = HashSet.empty[T] + val j = (i * o).toInt + // create two sets of size i with overlap o + val a = e ++ (0 until i).map(mkKey) + require(a.size == i, s"Building HashSet of size $i failed. Key type $keyType.") + val b = e ++ (j until (i + j)).map(mkKey) + require(b.size == i, s"Building HashSet of size $i failed. Key type $keyType.") + val as = e ++ (0 until j).map(mkKey) + require(as.size == j, s"Building HashSet of size $j failed. Key type $keyType.") + val hashCount0 = hashCount + val u = a union b + require(hashCount == hashCount0, s"key.hashCode should not be called, but has been called ${hashCount - hashCount0} times. Key type $keyType.") + require(u == (a union scala.collection.mutable.HashSet(b.toSeq: _*)), s"Operation must still work for other sets!") + require(u.size == i + j, s"Expected size ${i+j}. Real size ${u.size}. Key type $keyType.") + for (x <- 0 until i + j) + require(u.contains(mkKey(x)), s"Key type $keyType. Set (0 until ${i + j}) should contain $x but does not.") + val a_as = a union as + val as_a = as union a + require((a_as eq a) || (a_as eq as), s"No structural sharing in a union as. Key type $keyType, a=(0 until $i) as=(0 until $j)") + require((as_a eq a) || (as_a eq as), s"No structural sharing in as union a. Key type $keyType, a=(0 until $i) as=(0 until $j)") + } + } + + val sizes = Seq(1, 10, 100, 1000, 10000, 100000) + val offsets = Seq(0.0, 0.25, 0.5, 0.75, 1.0) + testUnion(sizes, offsets, "int", identity[Int]) + testUnion(sizes, offsets, "hashcounter", HashCounter.apply) + testUnion(sizes, offsets, "collision", Collision.apply) +} diff --git a/test/files/run/t6253b.scala b/test/files/run/t6253b.scala new file mode 100644 index 000000000000..9cbfefd49e2a --- /dev/null +++ b/test/files/run/t6253b.scala @@ -0,0 +1,62 @@ +import scala.collection.immutable.HashSet + +object Test extends App { + + var hashCount = 0 + + /** + * A key that produces lots of hash collisions, to exercise the part of the code that deals with those + */ + case class Collision(value: Int) { + + override def hashCode = { + hashCount += 1 + value / 5 + } + } + + /** + * A key that is identical to int other than that it counts hashCode invocations + */ + case class HashCounter(value: Int) { + + override def hashCode = { + hashCount += 1 + value + } + } + + def testIntersect[T](sizes: Seq[Int], offsets: Seq[Double], keyType: String, mkKey: Int => T) { + for { + i <- sizes + o <- offsets + } { + val e = HashSet.empty[T] + val j = (i * o).toInt + // create two sets of size i with overlap o + val a = e ++ (0 until i).map(mkKey) + require(a.size == i, s"Building HashSet of size $i failed. Key type $keyType.") + val b = e ++ (j until (i + j)).map(mkKey) + require(b.size == i, s"Building HashSet of size $i failed. Key type $keyType.") + val as = e ++ (0 until j).map(mkKey) + require(as.size == j, s"Building HashSet of size $j failed. Key type $keyType.") + val hashCount0 = hashCount + val u = a intersect b + require(hashCount == hashCount0, s"key.hashCode should not be called, but has been called ${hashCount - hashCount0} times. Key type $keyType.") + require(u == (a intersect scala.collection.mutable.HashSet(b.toSeq: _*)), s"Operation must still work for other sets!") + require(u.size == i - j, s"Expected size ${i + j}. Real size ${u.size}. Key type $keyType.") + for (x <- j until i) + require(u.contains(mkKey(x)), s"Key type $keyType. Set (0 until ${i + j}) should contain $x but does not.") + val a_as = a intersect as + val as_a = as intersect a + require((a_as eq as) || (a_as eq a), s"No structural sharing in a intersect as. Key type $keyType, a=(0 until $i) as=(0 until $j)") + require((as_a eq as) || (as_a eq a), s"No structural sharing in as intersect a. Key type $keyType, a=(0 until $i) as=(0 until $j)") + } + } + + val sizes = Seq(1, 10, 100, 1000, 10000, 100000) + val offsets = Seq(0.0, 0.25, 0.5, 0.75, 1.0) + testIntersect(sizes, offsets, "int", identity[Int]) + testIntersect(sizes, offsets, "hashcounter", HashCounter.apply) + testIntersect(sizes, offsets, "collision", Collision.apply) +} diff --git a/test/files/run/t6253c.scala b/test/files/run/t6253c.scala new file mode 100644 index 000000000000..71dfe1473ef7 --- /dev/null +++ b/test/files/run/t6253c.scala @@ -0,0 +1,63 @@ +import scala.collection.immutable.HashSet + +object Test extends App { + + var hashCount = 0 + + /** + * A key that produces lots of hash collisions, to exercise the part of the code that deals with those + */ + case class Collision(value: Int) { + + override def hashCode = { + hashCount += 1 + value / 5 + } + } + + /** + * A key that is identical to int other than that it counts hashCode invocations + */ + case class HashCounter(value: Int) { + + override def hashCode = { + hashCount += 1 + value + } + } + + def testDiff[T](sizes: Seq[Int], offsets: Seq[Double], keyType: String, mkKey: Int => T) { + for { + i <- sizes + o <- offsets + } { + val e = HashSet.empty[T] + val j = (i * o).toInt + // create two sets of size i with overlap o + val a = e ++ (0 until i).map(mkKey) + require(a.size == i, s"Building HashSet of size $i failed. Key type $keyType.") + val b = e ++ (j until (i + j)).map(mkKey) + require(b.size == i, s"Building HashSet of size $i failed. Key type $keyType.") + val as = e ++ (0 until j).map(mkKey) + require(as.size == j, s"Building HashSet of size $j failed. Key type $keyType.") + val hashCount0 = hashCount + val u = a diff b + require(hashCount == hashCount0, s"key.hashCode should not be called, but has been called ${hashCount - hashCount0} times. Key type $keyType.") + require(u == (a diff scala.collection.mutable.HashSet(b.toSeq: _*)), s"Operation must still work for other sets!") + require(u.size == j, s"Expected size $j. Real size ${u.size}. Key type $keyType.") + for (x <- 0 until j) + require(u.contains(mkKey(x)), s"Key type $keyType. Set (0 until ${i + j}) should contain $x but does not.") + require((as intersect b).isEmpty) + val b_as = b diff as + val as_b = as diff b + require((b_as eq b) || (b_as eq as), s"No structural sharing in b diff as. Key type $keyType, b=($j until ${i + j}) as=(0 until $j)") + require((as_b eq b) || (as_b eq as), s"No structural sharing in as diff b. Key type $keyType, b=($j until ${i + j}) as=(0 until $j)") + } + } + + val sizes = Seq(1, 10, 100, 1000, 10000, 100000) + val offsets = Seq(0.0, 0.25, 0.5, 0.75, 1.0) + testDiff(sizes, offsets, "int", identity[Int]) + testDiff(sizes, offsets, "hashCounter", HashCounter.apply) + testDiff(sizes, offsets, "collision", Collision.apply) +}