Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[MRG] Improving MinHash.remove_many(...) performance #1571

Merged
merged 8 commits into from
Jun 7, 2021
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions include/sourmash.h
Original file line number Diff line number Diff line change
Expand Up @@ -227,6 +227,8 @@ uint32_t kmerminhash_num(const SourmashKmerMinHash *ptr);

void kmerminhash_remove_hash(SourmashKmerMinHash *ptr, uint64_t h);

void kmerminhash_remove_from(SourmashKmerMinHash *ptr, const SourmashKmerMinHash *other);

mr-eyes marked this conversation as resolved.
Show resolved Hide resolved
void kmerminhash_remove_many(SourmashKmerMinHash *ptr,
const uint64_t *hashes_ptr,
uintptr_t insize);
Expand Down
9 changes: 9 additions & 0 deletions src/core/src/ffi/minhash.rs
Original file line number Diff line number Diff line change
Expand Up @@ -366,6 +366,15 @@ unsafe fn kmerminhash_add_from(ptr: *mut SourmashKmerMinHash, other: *const Sour
}
}

ffi_fn! {
unsafe fn kmerminhash_remove_from(ptr: *mut SourmashKmerMinHash, other: *const SourmashKmerMinHash)
-> Result<()> {
let mh = SourmashKmerMinHash::as_rust_mut(ptr);
let other_mh = SourmashKmerMinHash::as_rust(other);
mh.remove_from(other_mh)
}
}

ffi_fn! {
unsafe fn kmerminhash_count_common(ptr: *const SourmashKmerMinHash, other: *const SourmashKmerMinHash, downsample: bool)
-> Result<u64> {
Expand Down
8 changes: 8 additions & 0 deletions src/core/src/sketch/minhash.rs
Original file line number Diff line number Diff line change
Expand Up @@ -415,6 +415,14 @@ impl KmerMinHash {
};
}


Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Extra empty line here, remove it:

Suggested change

pub fn remove_from(&mut self, other: &KmerMinHash) -> Result<(), Error> {
for min in &other.mins {
self.remove_hash(*min);
}
Ok(())
}

pub fn remove_many(&mut self, hashes: &[u64]) -> Result<(), Error> {
for min in hashes {
self.remove_hash(*min);
Expand Down
11 changes: 9 additions & 2 deletions src/sourmash/minhash.py
Original file line number Diff line number Diff line change
Expand Up @@ -306,8 +306,15 @@ def add_many(self, hashes):
self._methodcall(lib.kmerminhash_add_many, list(hashes), len(hashes))

def remove_many(self, hashes):
"Remove many hashes at once; ``hashes`` must be an iterable."
self._methodcall(lib.kmerminhash_remove_many, list(hashes), len(hashes))
"""Remove many hashes from a sketch at once.

``hashes`` can be either an iterable (list, set, etc.), or another
``MinHash`` object.
"""
if isinstance(hashes, MinHash):
self._methodcall(lib.kmerminhash_remove_from, hashes._objptr)
else:
self._methodcall(lib.kmerminhash_remove_many, list(hashes), len(hashes))

def __len__(self):
"Number of hashes."
Expand Down
22 changes: 22 additions & 0 deletions tests/test_minhash.py
Original file line number Diff line number Diff line change
Expand Up @@ -1414,6 +1414,28 @@ def test_remove_many(track_abundance):
assert len(a) == 33
assert all(c % 6 != 0 for c in a.hashes)

def test_remove_minhash(track_abundance):
original_mh = MinHash(0, 10, track_abundance=track_abundance, scaled=scaled5000)
added_mh = MinHash(0, 10, track_abundance=track_abundance, scaled=scaled5000)
tested_mh = MinHash(0, 10, track_abundance=track_abundance, scaled=scaled5000)

original_mh.add_many(list(range(101)))
added_mh.add_many(list(range(101,201))) # contains original in it
tested_mh.add_many(list(range(201))) # original + added

# Now we should expect tested_minhash == original_minhash
# Note we are passing a MinHash object instead of an iterable object
tested_mh.remove_many(added_mh)

# Assertion
original_sig = signature.SourmashSignature(original_mh)
tested_sig = signature.SourmashSignature(tested_mh)

# Should pass if the hashes list in the same order
assert original_mh.hashes == tested_mh.hashes
assert len(original_mh) == len(tested_mh)
assert original_sig.md5sum() == tested_sig.md5sum()


def test_add_many(track_abundance):
a = MinHash(0, 10, track_abundance=track_abundance, scaled=scaled5000)
Expand Down