From 299bb3874cb60a6dfa8baed51843f95d2e91e020 Mon Sep 17 00:00:00 2001 From: Luiz Irber Date: Sat, 1 Aug 2020 13:50:14 -0700 Subject: [PATCH] getting things in place for _get_bf use get_bf use containment directly bench and update containment create new function for intersection and union size keep doing it inplace... wip simdeez remove simd --- include/sourmash.h | 4 +++ sourmash/nodegraph.py | 20 ++++++++++++++ sourmash/sbtmh.py | 25 +++++++++++++---- src/core/benches/nodegraph.rs | 20 +++++++++++++- src/core/src/ffi/nodegraph.rs | 46 ++++++++++++++++++++++++++++++++ src/core/src/sketch/nodegraph.rs | 31 +++++++++++++++++++-- 6 files changed, 138 insertions(+), 8 deletions(-) diff --git a/include/sourmash.h b/include/sourmash.h index 401c6b3407..c0a36b570b 100644 --- a/include/sourmash.h +++ b/include/sourmash.h @@ -208,6 +208,10 @@ bool kmerminhash_track_abundance(const SourmashKmerMinHash *ptr); void nodegraph_buffer_free(uint8_t *ptr, uintptr_t insize); +double nodegraph_containment(SourmashNodegraph *ptr, const SourmashNodegraph *optr); + +double nodegraph_containment_mh(SourmashNodegraph *ptr, const SourmashKmerMinHash *optr); + bool nodegraph_count(SourmashNodegraph *ptr, uint64_t h); bool nodegraph_count_kmer(SourmashNodegraph *ptr, const char *kmer); diff --git a/sourmash/nodegraph.py b/sourmash/nodegraph.py index 8faa2eb874..53e2eced6a 100644 --- a/sourmash/nodegraph.py +++ b/sourmash/nodegraph.py @@ -49,6 +49,26 @@ def update(self, other): # converted to a list of ints...) raise TypeError("Must be a Nodegraph or MinHash") + def containment(self, other): + if isinstance(other, Nodegraph): + return self._methodcall(lib.nodegraph_containment, other._objptr) + elif isinstance(other, MinHash): + return self._methodcall(lib.nodegraph_containment_mh, other._objptr) + else: + # FIXME: we could take sets here too (or anything that can be + # converted to a list of ints...) 
+ raise TypeError("Must be a Nodegraph or MinHash") + + def similarity(self, other): + if isinstance(other, Nodegraph): + return self._methodcall(lib.nodegraph_similarity, other._objptr) + elif isinstance(other, MinHash): + return self._methodcall(lib.nodegraph_similarity_mh, other._objptr) + else: + # FIXME: we could take sets here too (or anything that can be + # converted to a list of ints...) + raise TypeError("Must be a Nodegraph or MinHash") + def count(self, h): if isinstance(h, str): return self._methodcall(lib.nodegraph_count_kmer, to_bytes(h)) diff --git a/sourmash/sbtmh.py b/sourmash/sbtmh.py index d7db068466..bfbfd04045 100644 --- a/sourmash/sbtmh.py +++ b/sourmash/sbtmh.py @@ -75,18 +75,19 @@ def _max_jaccard_underneath_internal_node(node, query): This should yield be an upper bound on the Jaccard similarity for any signature below this point. """ + query_bf = _get_bf(node, query) mh = query.minhash if len(mh) == 0: return 0.0 - # count the maximum number of hash matches beneath this node - matches = node.data.matches(mh) - # J(A, B) = |A intersection B| / |A union B| # If we use only |A| as denominator, it is the containment # Because |A| <= |A union B|, it is also an upper bound on the max jaccard - max_score = float(matches) / len(mh) + max_score = query_bf.containment(node.data) + + #matches = node.data.matches(mh) + #max_score = float(matches) / len(mh) return max_score @@ -143,6 +144,8 @@ def search_minhashes_containment(node, sig, threshold, results=None, downsample= if isinstance(node, SigLeaf): matches = node.data.minhash.count_common(mh, downsample) else: # Node or Leaf, Nodegraph by minhash comparison + #bf = _get_bf(node, sig) + #matches = bf.containment(node.data) * len(mh) matches = node.data.matches(mh) if results is not None: @@ -165,11 +168,12 @@ def search(self, node, query, threshold, results=None): if isinstance(node, SigLeaf): matches = mh.count_common(node.data.minhash, True) else: # Nodegraph by minhash comparison + #bf = 
_get_bf(node, query) + #score = bf.containment(node.data) matches = node.data.matches(mh) if not matches: return 0 - score = float(matches) / len(mh) if score < threshold: @@ -187,3 +191,14 @@ def search(self, node, query, threshold, results=None): return 1 return 0 + + +def _get_bf(node, query): + try: + query_bf = query.bf + except AttributeError: + query_bf = node._factory() + query_bf.update(query.minhash) + query.bf = query_bf + + return query_bf diff --git a/src/core/benches/nodegraph.rs b/src/core/benches/nodegraph.rs index 8bbf4ce3d5..5387f01515 100644 --- a/src/core/benches/nodegraph.rs +++ b/src/core/benches/nodegraph.rs @@ -4,6 +4,7 @@ extern crate criterion; use std::fs::File; use std::io::{BufWriter, Cursor, Read}; +use sourmash::index::Comparable; use sourmash::sketch::nodegraph::Nodegraph; use criterion::Criterion; @@ -49,5 +50,22 @@ fn save_load(c: &mut Criterion) { }); } -criterion_group!(nodegraph, save_load); +fn comparable(c: &mut Criterion) { + let mut group = c.benchmark_group("nodegraph"); + group.sample_size(10); + + let f = File::open("../../tests/test-data/.sbt.v3/internal.0").unwrap(); + let ng = Nodegraph::from_reader(f).unwrap(); + let ng_2 = ng.clone(); + + group.bench_function("comparable containment", |b| { + b.iter(|| ng.containment(&ng_2)); + }); + + group.bench_function("comparable similarity", |b| { + b.iter(|| ng.similarity(&ng_2)); + }); +} + +criterion_group!(nodegraph, save_load, comparable); criterion_main!(nodegraph); diff --git a/src/core/src/ffi/nodegraph.rs b/src/core/src/ffi/nodegraph.rs index 29a1c3e84c..7d7d578240 100644 --- a/src/core/src/ffi/nodegraph.rs +++ b/src/core/src/ffi/nodegraph.rs @@ -3,6 +3,7 @@ use std::os::raw::c_char; use std::slice; use crate::index::sbt::Update; +use crate::index::Comparable; use crate::sketch::nodegraph::Nodegraph; use crate::ffi::minhash::SourmashKmerMinHash; @@ -157,9 +158,54 @@ pub unsafe extern "C" fn nodegraph_update_mh( let ng = SourmashNodegraph::as_rust_mut(ptr); let mh = 
SourmashKmerMinHash::as_rust(optr); + // FIXME raise an exception properly mh.update(ng).unwrap(); } +#[no_mangle] +pub unsafe extern "C" fn nodegraph_containment( + ptr: *const SourmashNodegraph, + optr: *const SourmashNodegraph, +) -> f64 { + let ng = SourmashNodegraph::as_rust(ptr); + let ong = SourmashNodegraph::as_rust(optr); + + ng.containment(ong) +} + +#[no_mangle] +pub unsafe extern "C" fn nodegraph_containment_mh( + ptr: *const SourmashNodegraph, + optr: *const SourmashKmerMinHash, +) -> f64 { + let ng = SourmashNodegraph::as_rust(ptr); + let mh = SourmashKmerMinHash::as_rust(optr); + + ng.containment(mh) +} + +#[no_mangle] +pub unsafe extern "C" fn nodegraph_similarity( + ptr: *const SourmashNodegraph, + optr: *const SourmashNodegraph, +) -> f64 { + let ng = SourmashNodegraph::as_rust(ptr); + let ong = SourmashNodegraph::as_rust(optr); + + ng.similarity(ong) +} + +#[no_mangle] +pub unsafe extern "C" fn nodegraph_similarity_mh( + ptr: *const SourmashNodegraph, + optr: *const SourmashKmerMinHash, +) -> f64 { + let ng = SourmashNodegraph::as_rust(ptr); + let mh = SourmashKmerMinHash::as_rust(optr); + + ng.similarity(mh) +} + ffi_fn! 
{ unsafe fn nodegraph_from_path(filename: *const c_char) -> Result<*mut SourmashNodegraph> { // FIXME use buffer + len instead of c_str diff --git a/src/core/src/sketch/nodegraph.rs b/src/core/src/sketch/nodegraph.rs index 34a34a8ca3..e4eea63bda 100644 --- a/src/core/src/sketch/nodegraph.rs +++ b/src/core/src/sketch/nodegraph.rs @@ -7,6 +7,7 @@ use byteorder::{BigEndian, ByteOrder, LittleEndian, ReadBytesExt, WriteBytesExt} use fixedbitset::FixedBitSet; use crate::index::sbt::Update; +use crate::index::Comparable; use crate::sketch::minhash::KmerMinHash; use crate::Error; use crate::HashIntoType; @@ -288,8 +289,20 @@ impl Nodegraph { pub fn unique_kmers(&self) -> usize { self.unique_kmers } +} + +impl Comparable<&Nodegraph> for Nodegraph { + fn similarity(&self, other: &&Nodegraph) -> f64 { + self.similarity(*other) + } + + fn containment(&self, other: &&Nodegraph) -> f64 { + self.containment(*other) + } +} - pub fn similarity(&self, other: &Nodegraph) -> f64 { +impl Comparable<Nodegraph> for Nodegraph { + fn similarity(&self, other: &Nodegraph) -> f64 { let result: usize = self .bs .iter() @@ -305,7 +318,7 @@ impl Nodegraph { result as f64 / size as f64 } - pub fn containment(&self, other: &Nodegraph) -> f64 { + fn containment(&self, other: &Nodegraph) -> f64 { let result: usize = self .bs .iter() @@ -317,6 +330,20 @@ impl Nodegraph { } } +impl Comparable<KmerMinHash> for Nodegraph { + fn similarity(&self, other: &KmerMinHash) -> f64 { + unimplemented!() + } + + fn containment(&self, other: &KmerMinHash) -> f64 { + /* + let result: usize = other.mins().iter().map(|h| self.get(*h)).sum(); + result as f64 / self.size() as f64 + */ + unimplemented!() + } +} + fn twobit_repr(a: u8) -> HashIntoType { match a as char { 'A' => 0,