Skip to content

Commit

Permalink
Refactor/expand MinHash add, iadd, and merge functionality. (#1282)
Browse files Browse the repository at this point in the history
* add __add__ on MinHash objects, do simple testing
* fix aberrant merge tests
  • Loading branch information
ctb committed Feb 6, 2021
1 parent bc87da7 commit 465a06d
Show file tree
Hide file tree
Showing 2 changed files with 139 additions and 28 deletions.
15 changes: 13 additions & 2 deletions src/sourmash/minhash.py
Original file line number Diff line number Diff line change
Expand Up @@ -541,13 +541,24 @@ def contained_by(self, other, downsample=False):

return self.count_common(other, downsample) / len(self)

def __add__(self, other):
    """Return a new MinHash holding the merge of ``self`` and ``other``.

    Works on a copy of ``self`` (via ``__copy__``) and applies ``+=`` to
    that copy, so ``self`` is not the object being added to.

    Raises:
        TypeError: if ``other`` is not a MinHash.
    """
    if isinstance(other, MinHash):
        combined = self.__copy__()
        # Delegates the actual merge to __iadd__ on the copy.
        combined += other
        return combined

    raise TypeError("can only add MinHash objects to MinHash objects!")

def __iadd__(self, other):
    """Merge ``other`` into this MinHash in place (``self += other``).

    Passes ``other``'s underlying object pointer to
    ``lib.kmerminhash_merge`` through ``self._methodcall``.

    Returns:
        ``self``, as the in-place operator protocol requires.

    Raises:
        TypeError: if ``other`` is not a MinHash.
    """
    if not isinstance(other, MinHash):
        # Single guard with the same message used by __add__ and merge;
        # the previous second ``raise`` could never execute.
        raise TypeError("can only add MinHash objects to MinHash objects!")
    self._methodcall(lib.kmerminhash_merge, other._get_objptr())
    return self

def merge(self, other):
    """Merge ``other`` into this MinHash in place.

    Makes the same ``lib.kmerminhash_merge`` call as ``__iadd__`` but,
    being a plain mutator, returns None rather than ``self``.

    Raises:
        TypeError: if ``other`` is not a MinHash.
    """
    if not isinstance(other, MinHash):
        raise TypeError("can only add MinHash objects to MinHash objects!")
    self._methodcall(lib.kmerminhash_merge, other._get_objptr())

def set_abundances(self, values, clear=True):
"""Set abundances for hashes from ``values``, where
Expand Down
152 changes: 126 additions & 26 deletions tests/test__minhash.py
Original file line number Diff line number Diff line change
Expand Up @@ -650,43 +650,45 @@ def test_mh_merge_typeerror(track_abundance):

def test_mh_merge(track_abundance):
    """Merging two identically configured MinHashes is order-independent."""
    a = MinHash(100, 10, track_abundance=track_abundance)
    for i in range(0, 40, 2):
        a.add_hash(i)

    b = MinHash(100, 10, track_abundance=track_abundance)
    for i in range(0, 80, 4):
        b.add_hash(i)

    # merge() works in place and returns None, so merge into copies and
    # leave ``a`` and ``b`` untouched for the reverse-order merge.
    c = a.__copy__()
    c.merge(b)

    d = b.__copy__()
    d.merge(a)

    assert len(c) == len(d)
    # Compare (hash, value) pairs, not just the hash keys.
    assert sorted(c.hashes.items()) == sorted(d.hashes.items())

    assert round(c.similarity(d), 3) == 1.0
    assert round(d.similarity(c), 3) == 1.0


def test_mh_merge_empty_num(track_abundance):
    """Merging an empty and a populated MinHash gives the same result
    in either order."""
    a = MinHash(100, 10, track_abundance=track_abundance)

    b = MinHash(100, 10, track_abundance=track_abundance)
    for i in range(0, 80, 4):
        b.add_hash(i)

    # merge() works in place and returns None, so merge into copies.
    c = a.__copy__()
    c.merge(b)

    d = b.__copy__()
    d.merge(a)

    # The merged sketch must not be empty.
    assert len(c)
    assert len(c) == len(d)

    # Compare (hash, value) pairs, not just the hash keys.
    assert sorted(c.hashes.items()) == sorted(d.hashes.items())
    assert round(c.similarity(d), 3) == 1.0
    assert round(d.similarity(c), 3) == 1.0

Expand All @@ -699,13 +701,16 @@ def test_mh_merge_empty_scaled(track_abundance):
for i in range(0, 80, 4):
b.add_hash(i)

c = a.merge(b)
d = b.merge(a)
c = a.__copy__()
c.merge(b)

d = b.__copy__()
d.merge(a)

assert len(c)
assert len(c) == len(d)

assert list(sorted(c.hashes)) == list(sorted(d.hashes))
assert list(sorted(c.hashes.items())) == list(sorted(d.hashes.items()))
assert round(c.similarity(d), 3) == 1.0
assert round(d.similarity(c), 3) == 1.0

Expand All @@ -719,7 +724,8 @@ def test_mh_merge_check_length(track_abundance):
for i in range(0, 80, 4):
b.add_hash(i)

c = a.merge(b)
c = a.__copy__()
c.merge(b)
assert len(c.hashes) == 20


Expand All @@ -735,7 +741,8 @@ def test_mh_merge_check_length2(track_abundance):
b.add_hash(1)
b.add_hash(4)

c = a.merge(b)
c = a.__copy__()
c.merge(b)
assert len(c.hashes) == 3

def test_mh_asymmetric_merge(track_abundance):
Expand All @@ -749,8 +756,10 @@ def test_mh_asymmetric_merge(track_abundance):
for i in range(0, 80, 4):
b.add_hash(i)

c = a.merge(b)
d = b.merge(a)
c = a.__copy__()
c.merge(b)
d = b.__copy__()
d.merge(a)

assert len(a) == 20
assert len(b) == 10
Expand All @@ -764,15 +773,15 @@ def test_mh_asymmetric_merge(track_abundance):
a = a.downsample(num=d.num)

if track_abundance:
assert round(d.similarity(a), 3) == 0.91
assert round(d.similarity(a), 3) == 0.795
else:
assert round(d.similarity(a), 3) == 1.0

c = c.downsample(num=b.num)
if track_abundance:
assert round(c.similarity(b), 3) == 0.91
assert round(c.similarity(b), 3) == 0.436
else:
assert c.similarity(b) == 1.0
assert c.similarity(b) == 0.5


def test_mh_inplace_concat_asymmetric(track_abundance):
Expand Down Expand Up @@ -1553,3 +1562,94 @@ def test_is_molecule_type_4(track_abundance):
assert not mh.is_protein
assert not mh.hp
assert mh.dayhoff


def test_addition_abund():
    """``+`` on abundance-tracking MinHashes adds the abundances of a
    shared hash (1 + 3 == 4)."""
    left = MinHash(10, 21, track_abundance=True)
    right = MinHash(10, 21, track_abundance=True)

    left.set_abundances({0: 1})
    right.set_abundances({0: 3})

    total = left + right
    counts = total.hashes

    assert len(counts) == 1
    assert counts[0] == 4


def test_addition_noabund():
    """``+`` without abundance tracking keeps a shared hash at value 1."""
    left = MinHash(10, 21, track_abundance=False)
    right = MinHash(10, 21, track_abundance=False)

    left.add_hash(0)
    right.add_hash(0)

    total = left + right
    counts = total.hashes

    assert len(counts) == 1
    assert counts[0] == 1


def test_iaddition_abund():
    """``+=`` with abundance tracking updates the left operand to the
    summed abundance and leaves the right operand unchanged."""
    target = MinHash(10, 21, track_abundance=True)
    source = MinHash(10, 21, track_abundance=True)

    target.set_abundances({0: 1})
    source.set_abundances({0: 3})

    target += source

    target_counts = target.hashes
    assert len(target_counts) == 1
    assert target_counts[0] == 4

    # The right-hand operand still holds its original abundance.
    source_counts = source.hashes
    assert len(source_counts) == 1
    assert source_counts[0] == 3


def test_iaddition_noabund():
    """``+=`` without abundance tracking keeps a shared hash at value 1."""
    target = MinHash(10, 21, track_abundance=False)
    source = MinHash(10, 21, track_abundance=False)

    target.add_hash(0)
    source.add_hash(0)

    target += source

    counts = target.hashes
    assert len(counts) == 1
    assert counts[0] == 1


def test_merge_abund():
    """``merge`` returns None, sums abundances into the receiver, and
    leaves the argument unchanged."""
    target = MinHash(10, 21, track_abundance=True)
    source = MinHash(10, 21, track_abundance=True)

    target.set_abundances({0: 1})
    source.set_abundances({0: 3})

    # merge() is an in-place mutator with no return value.
    assert target.merge(source) is None

    target_counts = target.hashes
    assert len(target_counts) == 1
    assert target_counts[0] == 4

    source_counts = source.hashes
    assert len(source_counts) == 1
    assert source_counts[0] == 3


def test_merge_noabund():
    """``merge`` without abundance tracking returns None and keeps a
    shared hash at value 1."""
    target = MinHash(10, 21, track_abundance=False)
    source = MinHash(10, 21, track_abundance=False)

    target.add_hash(0)
    source.add_hash(0)

    # merge() is an in-place mutator with no return value.
    assert target.merge(source) is None

    counts = target.hashes
    assert len(counts) == 1
    assert counts[0] == 1

0 comments on commit 465a06d

Please sign in to comment.