From 465a06d07e03f1e7ee68ca1f5e88a3d0273441ad Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Fri, 5 Feb 2021 19:45:35 -0800 Subject: [PATCH] Refactor/expand MinHash add, iadd, and merge functionality. (#1282) * add __add__ on MinHash objects, do simple testing * fix aberrant merge tests --- src/sourmash/minhash.py | 15 +++- tests/test__minhash.py | 152 +++++++++++++++++++++++++++++++++------- 2 files changed, 139 insertions(+), 28 deletions(-) diff --git a/src/sourmash/minhash.py b/src/sourmash/minhash.py index 07e6dc454e..e419b2b091 100644 --- a/src/sourmash/minhash.py +++ b/src/sourmash/minhash.py @@ -541,13 +541,24 @@ def contained_by(self, other, downsample=False): return self.count_common(other, downsample) / len(self) + def __add__(self, other): + if not isinstance(other, MinHash): + raise TypeError("can only add MinHash objects to MinHash objects!") + + new_obj = self.__copy__() + new_obj += other + return new_obj + def __iadd__(self, other): if not isinstance(other, MinHash): - raise TypeError("Must be a MinHash!") + raise TypeError("can only add MinHash objects to MinHash objects!") self._methodcall(lib.kmerminhash_merge, other._get_objptr()) return self - merge = __iadd__ + def merge(self, other): + if not isinstance(other, MinHash): + raise TypeError("can only add MinHash objects to MinHash objects!") + self._methodcall(lib.kmerminhash_merge, other._get_objptr()) def set_abundances(self, values, clear=True): """Set abundances for hashes from ``values``, where diff --git a/tests/test__minhash.py b/tests/test__minhash.py index 202f9b7509..c05dbce6ac 100644 --- a/tests/test__minhash.py +++ b/tests/test__minhash.py @@ -650,43 +650,45 @@ def test_mh_merge_typeerror(track_abundance): def test_mh_merge(track_abundance): # test merging two identically configured minhashes - a = MinHash(20, 10, track_abundance=track_abundance) + a = MinHash(100, 10, track_abundance=track_abundance) for i in range(0, 40, 2): a.add_hash(i) - b = MinHash(20, 10, track_abundance=track_abundance) + b = MinHash(100, 10, track_abundance=track_abundance) for i in range(0, 80, 4): b.add_hash(i) - c = a.merge(b) - d = b.merge(a) + c = a.__copy__() + c.merge(b) + + d = b.__copy__() + d.merge(a) assert len(c) == len(d) - assert list(sorted(c.hashes)) == list(sorted(d.hashes)) + assert list(sorted(c.hashes.items())) == list(sorted(d.hashes.items())) - if track_abundance: - assert round(c.similarity(d), 3) == 0.91 - assert round(d.similarity(c), 3) == 0.91 - else: - assert round(c.similarity(d), 3) == 1.0 - assert round(d.similarity(c), 3) == 1.0 + assert round(c.similarity(d), 3) == 1.0 + assert round(d.similarity(c), 3) == 1.0 def test_mh_merge_empty_num(track_abundance): # test merging two identically configured minhashes, one empty - a = MinHash(20, 10, track_abundance=track_abundance) + a = MinHash(100, 10, track_abundance=track_abundance) - b = MinHash(20, 10, track_abundance=track_abundance) + b = MinHash(100, 10, track_abundance=track_abundance) for i in range(0, 80, 4): b.add_hash(i) - c = a.merge(b) - d = b.merge(a) + c = a.__copy__() + c.merge(b) + + d = b.__copy__() + d.merge(a) assert len(c) assert len(c) == len(d) - assert list(sorted(c.hashes)) == list(sorted(d.hashes)) + assert list(sorted(c.hashes.items())) == list(sorted(d.hashes.items())) assert round(c.similarity(d), 3) == 1.0 assert round(d.similarity(c), 3) == 1.0 @@ -699,13 +701,16 @@ def test_mh_merge_empty_scaled(track_abundance): for i in range(0, 80, 4): b.add_hash(i) - c = a.merge(b) - d = b.merge(a) + c = a.__copy__() + c.merge(b) + + d = b.__copy__() + d.merge(a) assert len(c) assert len(c) == len(d) - assert list(sorted(c.hashes)) == list(sorted(d.hashes)) + assert list(sorted(c.hashes.items())) == list(sorted(d.hashes.items())) assert round(c.similarity(d), 3) == 1.0 assert round(d.similarity(c), 3) == 1.0 @@ -719,7 +724,8 @@ def test_mh_merge_check_length(track_abundance): for i in range(0, 80, 4): b.add_hash(i) - c = a.merge(b) + c = a.__copy__() + c.merge(b) assert len(c.hashes) == 20 @@ -735,7 +741,8 @@ def test_mh_merge_check_length2(track_abundance): b.add_hash(1) b.add_hash(4) - c = a.merge(b) + c = a.__copy__() + c.merge(b) assert len(c.hashes) == 3 def test_mh_asymmetric_merge(track_abundance): @@ -749,8 +756,10 @@ def test_mh_asymmetric_merge(track_abundance): for i in range(0, 80, 4): b.add_hash(i) - c = a.merge(b) - d = b.merge(a) + c = a.__copy__() + c.merge(b) + d = b.__copy__() + d.merge(a) assert len(a) == 20 assert len(b) == 10 @@ -764,15 +773,15 @@ def test_mh_asymmetric_merge(track_abundance): a = a.downsample(num=d.num) if track_abundance: - assert round(d.similarity(a), 3) == 0.91 + assert round(d.similarity(a), 3) == 0.795 else: assert round(d.similarity(a), 3) == 1.0 c = c.downsample(num=b.num) if track_abundance: - assert round(c.similarity(b), 3) == 0.91 + assert round(c.similarity(b), 3) == 0.436 else: - assert c.similarity(b) == 1.0 + assert c.similarity(b) == 0.5 def test_mh_inplace_concat_asymmetric(track_abundance): @@ -1553,3 +1562,94 @@ def test_is_molecule_type_4(track_abundance): assert not mh.is_protein assert not mh.hp assert mh.dayhoff + + +def test_addition_abund(): + mh1 = MinHash(10, 21, track_abundance=True) + mh2 = MinHash(10, 21, track_abundance=True) + + mh1.set_abundances({ 0: 1 }) + mh2.set_abundances({ 0: 3 }) + + mh3 = mh1 + mh2 + hashcounts = mh3.hashes + assert len(hashcounts) == 1 + + assert hashcounts[0] == 4 + + +def test_addition_noabund(): + mh1 = MinHash(10, 21, track_abundance=False) + mh2 = MinHash(10, 21, track_abundance=False) + + mh1.add_hash(0) + mh2.add_hash(0) + + mh3 = mh1 + mh2 + hashcounts = mh3.hashes + assert len(hashcounts) == 1 + assert hashcounts[0] == 1 + + +def test_iaddition_abund(): + mh1 = MinHash(10, 21, track_abundance=True) + mh2 = MinHash(10, 21, track_abundance=True) + + mh1.set_abundances({ 0: 1 }) + mh2.set_abundances({ 0: 3 }) + + mh1 += mh2 + hashcounts = mh1.hashes + assert len(hashcounts) == 1 + assert hashcounts[0] == 4 + + hashcounts2 = mh2.hashes + assert len(hashcounts2) == 1 + assert hashcounts2[0] == 3 + + +def test_iaddition_noabund(): + mh1 = MinHash(10, 21, track_abundance=False) + mh2 = MinHash(10, 21, track_abundance=False) + + mh1.add_hash(0) + mh2.add_hash(0) + + mh1 += mh2 + hashcounts = mh1.hashes + assert len(hashcounts) == 1 + assert hashcounts[0] == 1 + + +def test_merge_abund(): + mh1 = MinHash(10, 21, track_abundance=True) + mh2 = MinHash(10, 21, track_abundance=True) + + mh1.set_abundances({ 0: 1 }) + mh2.set_abundances({ 0: 3 }) + + ret = mh1.merge(mh2) + assert ret is None + + hashcounts = mh1.hashes + assert len(hashcounts) == 1 + assert hashcounts[0] == 4 + + hashcounts2 = mh2.hashes + assert len(hashcounts2) == 1 + assert hashcounts2[0] == 3 + + +def test_merge_noabund(): + mh1 = MinHash(10, 21, track_abundance=False) + mh2 = MinHash(10, 21, track_abundance=False) + + mh1.add_hash(0) + mh2.add_hash(0) + + ret = mh1.merge(mh2) + assert ret is None + + hashcounts = mh1.hashes + assert len(hashcounts) == 1 + assert hashcounts[0] == 1