From 022b387575ca042ecddf61a2420194b89ada73e8 Mon Sep 17 00:00:00 2001 From: mr-eyes Date: Sun, 6 Jun 2021 20:15:44 +0200 Subject: [PATCH 1/7] draft test case that should fail --- tests/test_minhash.py | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/tests/test_minhash.py b/tests/test_minhash.py index 79d8a8a9e7..1c9f8bda34 100644 --- a/tests/test_minhash.py +++ b/tests/test_minhash.py @@ -1414,6 +1414,30 @@ def test_remove_many(track_abundance): assert len(a) == 33 assert all(c % 6 != 0 for c in a.hashes) +# tmp name / draft test case +def test_remove_many(track_abundance): + import sys + + original_mh = MinHash(0, 10, track_abundance=track_abundance, scaled=scaled5000) + added_mh = MinHash(0, 10, track_abundance=track_abundance, scaled=scaled5000) + tested_mh = MinHash(0, 10, track_abundance=track_abundance, scaled=scaled5000) + + original_mh.add_many(list(range(101))) + added_mh.add_many(list(range(101,201))) # contains original in it + tested_mh.add_many(list(range(201))) # original + added + + # Now we should expect tested_minhash == original_minhash + tested_mh.remove_many(added_mh) + + # Assertion + original_sig = signature.SourmashSignature(original_mh) + tested_sig = signature.SourmashSignature(tested_mh) + + # Should pass if the hashes list in the same order + assert original_mh.hashes == tested_mh.hashes + assert len(original_mh) == len(tested_mh) + assert original_sig.md5sum() == tested_sig.md5sum() + def test_add_many(track_abundance): a = MinHash(0, 10, track_abundance=track_abundance, scaled=scaled5000) From d6c5660ca9ac27a80ce6c02ab72da415538fab5c Mon Sep 17 00:00:00 2001 From: mr-eyes Date: Sun, 6 Jun 2021 20:52:41 +0200 Subject: [PATCH 2/7] rename test case --- tests/test_minhash.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tests/test_minhash.py b/tests/test_minhash.py index 1c9f8bda34..bacfbeb457 100644 --- a/tests/test_minhash.py +++ b/tests/test_minhash.py @@ -1415,9 +1415,7 @@ def test_remove_many(track_abundance): assert all(c % 6 != 0 for c in a.hashes) # tmp name / draft test case -def test_remove_many(track_abundance): - import sys - +def test_remove_many_new(track_abundance): original_mh = MinHash(0, 10, track_abundance=track_abundance, scaled=scaled5000) added_mh = MinHash(0, 10, track_abundance=track_abundance, scaled=scaled5000) tested_mh = MinHash(0, 10, track_abundance=track_abundance, scaled=scaled5000) From 8107bc0cdfc8072a082f4d78877feee1daa80bf6 Mon Sep 17 00:00:00 2001 From: mr-eyes Date: Sun, 6 Jun 2021 23:28:03 +0200 Subject: [PATCH 3/7] adding rust:remove_from(MinHash) --- include/sourmash.h | 2 ++ src/core/src/ffi/minhash.rs | 9 +++++++++ src/core/src/sketch/minhash.rs | 8 ++++++++ 3 files changed, 19 insertions(+) diff --git a/include/sourmash.h b/include/sourmash.h index 7f88fcc203..464902c6a7 100644 --- a/include/sourmash.h +++ b/include/sourmash.h @@ -227,6 +227,8 @@ uint32_t kmerminhash_num(const SourmashKmerMinHash *ptr); void kmerminhash_remove_hash(SourmashKmerMinHash *ptr, uint64_t h); +void kmerminhash_remove_from(SourmashKmerMinHash *ptr, const SourmashKmerMinHash *other); + void kmerminhash_remove_many(SourmashKmerMinHash *ptr, const uint64_t *hashes_ptr, uintptr_t insize); diff --git a/src/core/src/ffi/minhash.rs b/src/core/src/ffi/minhash.rs index 1618b9de62..7c3c6358c5 100644 --- a/src/core/src/ffi/minhash.rs +++ b/src/core/src/ffi/minhash.rs @@ -366,6 +366,15 @@ unsafe fn kmerminhash_add_from(ptr: *mut SourmashKmerMinHash, other: *const Sour } } +ffi_fn! { + unsafe fn kmerminhash_remove_from(ptr: *mut SourmashKmerMinHash, other: *const SourmashKmerMinHash) + -> Result<()> { + let mh = SourmashKmerMinHash::as_rust_mut(ptr); + let other_mh = SourmashKmerMinHash::as_rust(other); + mh.remove_from(other_mh) + } +} + ffi_fn! { unsafe fn kmerminhash_count_common(ptr: *const SourmashKmerMinHash, other: *const SourmashKmerMinHash, downsample: bool) -> Result { diff --git a/src/core/src/sketch/minhash.rs b/src/core/src/sketch/minhash.rs index a9d8a9d778..b7300af49a 100644 --- a/src/core/src/sketch/minhash.rs +++ b/src/core/src/sketch/minhash.rs @@ -415,6 +415,14 @@ impl KmerMinHash { }; } + + pub fn remove_from(&mut self, other: &KmerMinHash) -> Result<(), Error> { + for min in &other.mins { + self.remove_hash(*min); + } + Ok(()) + } + pub fn remove_many(&mut self, hashes: &[u64]) -> Result<(), Error> { for min in hashes { self.remove_hash(*min); From e4e80d5d61f65ac255e4d6e01ff0b9edd652e8dc Mon Sep 17 00:00:00 2001 From: mr-eyes Date: Sun, 6 Jun 2021 23:28:41 +0200 Subject: [PATCH 4/7] python:remove_many accepts MinHash --- src/sourmash/minhash.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/src/sourmash/minhash.py b/src/sourmash/minhash.py index cce9077034..55fb0901aa 100644 --- a/src/sourmash/minhash.py +++ b/src/sourmash/minhash.py @@ -307,8 +307,15 @@ def add_many(self, hashes): self._methodcall(lib.kmerminhash_add_many, list(hashes), len(hashes)) def remove_many(self, hashes): - "Remove many hashes at once; ``hashes`` must be an iterable." - self._methodcall(lib.kmerminhash_remove_many, list(hashes), len(hashes)) + """Remove many hashes from a sketch at once. + + ``hashes`` can be either an iterable (list, set, etc.), or another + ``MinHash`` object. + """ + if isinstance(hashes, MinHash): + self._methodcall(lib.kmerminhash_remove_from, hashes._objptr) + else: + self._methodcall(lib.kmerminhash_remove_many, list(hashes), len(hashes)) def __len__(self): "Number of hashes." From f080250fe120e56ec526fd6e461114f773e1791d Mon Sep 17 00:00:00 2001 From: mr-eyes Date: Sun, 6 Jun 2021 23:29:26 +0200 Subject: [PATCH 5/7] rename add_many(MinHash) test case --- tests/test_minhash.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_minhash.py b/tests/test_minhash.py index bacfbeb457..c6a3326442 100644 --- a/tests/test_minhash.py +++ b/tests/test_minhash.py @@ -1414,8 +1414,7 @@ def test_remove_many(track_abundance): assert len(a) == 33 assert all(c % 6 != 0 for c in a.hashes) -# tmp name / draft test case -def test_remove_many_new(track_abundance): +def test_remove_minhash(track_abundance): original_mh = MinHash(0, 10, track_abundance=track_abundance, scaled=scaled5000) added_mh = MinHash(0, 10, track_abundance=track_abundance, scaled=scaled5000) tested_mh = MinHash(0, 10, track_abundance=track_abundance, scaled=scaled5000) @@ -1425,6 +1424,7 @@ def test_remove_many_new(track_abundance): tested_mh.add_many(list(range(201))) # original + added # Now we should expect tested_minhash == original_minhash + # Note we are passing a MinHash object instead of an iterable object tested_mh.remove_many(added_mh) # Assertion From f6669f29b1d08c16a69e126183b0e1a4abb783bf Mon Sep 17 00:00:00 2001 From: Mohamed Abuelanin Date: Mon, 7 Jun 2021 20:11:45 +0200 Subject: [PATCH 6/7] Update include/sourmash.h Co-authored-by: Luiz Irber --- include/sourmash.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/sourmash.h b/include/sourmash.h index 464902c6a7..a7bb4a9b71 100644 --- a/include/sourmash.h +++ b/include/sourmash.h @@ -225,10 +225,10 @@ SourmashKmerMinHash *kmerminhash_new(uint64_t scaled, uint32_t kmerminhash_num(const SourmashKmerMinHash *ptr); -void kmerminhash_remove_hash(SourmashKmerMinHash *ptr, uint64_t h); - void kmerminhash_remove_from(SourmashKmerMinHash *ptr, const SourmashKmerMinHash *other); +void kmerminhash_remove_hash(SourmashKmerMinHash *ptr, uint64_t h); + void kmerminhash_remove_many(SourmashKmerMinHash *ptr, const uint64_t *hashes_ptr, uintptr_t insize); From 9c2d6fa7ba4054525b5fcdd8d724251f40221ecd Mon Sep 17 00:00:00 2001 From: mr-eyes Date: Mon, 7 Jun 2021 20:52:09 +0200 Subject: [PATCH 7/7] rust formatting --- src/core/src/sketch/minhash.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/src/core/src/sketch/minhash.rs b/src/core/src/sketch/minhash.rs index b7300af49a..a1a7bc80e0 100644 --- a/src/core/src/sketch/minhash.rs +++ b/src/core/src/sketch/minhash.rs @@ -415,7 +415,6 @@ impl KmerMinHash { }; } - pub fn remove_from(&mut self, other: &KmerMinHash) -> Result<(), Error> { for min in &other.mins { self.remove_hash(*min);