From c13ba85b0bebaeaca25d1fcab441260ffa1af732 Mon Sep 17 00:00:00 2001 From: Luiz Irber Date: Sat, 1 Aug 2020 13:50:14 -0700 Subject: [PATCH] getting things in place for _get_bf use get_bf use containment directly bench and update containment create new function for intersection and union size keep doing it inplace... wip simdeez remove simd --- include/sourmash.h | 8 ++++++ src/core/benches/nodegraph.rs | 20 +++++++++++++- src/core/src/ffi/nodegraph.rs | 46 ++++++++++++++++++++++++++++++++ src/core/src/sketch/nodegraph.rs | 31 +++++++++++++++++++-- src/sourmash/nodegraph.py | 20 ++++++++++++++ src/sourmash/sbtmh.py | 29 +++++++++++++++----- 6 files changed, 144 insertions(+), 10 deletions(-) diff --git a/include/sourmash.h b/include/sourmash.h index b5e111b66..361d1663a 100644 --- a/include/sourmash.h +++ b/include/sourmash.h @@ -245,6 +245,10 @@ bool kmerminhash_track_abundance(const SourmashKmerMinHash *ptr); void nodegraph_buffer_free(uint8_t *ptr, uintptr_t insize); +double nodegraph_containment(const SourmashNodegraph *ptr, const SourmashNodegraph *optr); + +double nodegraph_containment_mh(const SourmashNodegraph *ptr, const SourmashKmerMinHash *optr); + bool nodegraph_count(SourmashNodegraph *ptr, uint64_t h); bool nodegraph_count_kmer(SourmashNodegraph *ptr, const char *kmer); @@ -275,6 +279,10 @@ uintptr_t nodegraph_ntables(const SourmashNodegraph *ptr); void nodegraph_save(const SourmashNodegraph *ptr, const char *filename); +double nodegraph_similarity(const SourmashNodegraph *ptr, const SourmashNodegraph *optr); + +double nodegraph_similarity_mh(const SourmashNodegraph *ptr, const SourmashKmerMinHash *optr); + const uint8_t *nodegraph_to_buffer(const SourmashNodegraph *ptr, uint8_t compression, uintptr_t *size); diff --git a/src/core/benches/nodegraph.rs b/src/core/benches/nodegraph.rs index 8bbf4ce3d..5387f0151 100644 --- a/src/core/benches/nodegraph.rs +++ b/src/core/benches/nodegraph.rs @@ -4,6 +4,7 @@ extern crate criterion; use std::fs::File; use 
std::io::{BufWriter, Cursor, Read}; +use sourmash::index::Comparable; use sourmash::sketch::nodegraph::Nodegraph; use criterion::Criterion; @@ -49,5 +50,22 @@ fn save_load(c: &mut Criterion) { }); } -criterion_group!(nodegraph, save_load); +fn comparable(c: &mut Criterion) { + let mut group = c.benchmark_group("nodegraph"); + group.sample_size(10); + + let f = File::open("../../tests/test-data/.sbt.v3/internal.0").unwrap(); + let ng = Nodegraph::from_reader(f).unwrap(); + let ng_2 = ng.clone(); + + group.bench_function("comparable containment", |b| { + b.iter(|| ng.containment(&ng_2)); + }); + + group.bench_function("comparable similarity", |b| { + b.iter(|| ng.similarity(&ng_2)); + }); +} + +criterion_group!(nodegraph, save_load, comparable); criterion_main!(nodegraph); diff --git a/src/core/src/ffi/nodegraph.rs b/src/core/src/ffi/nodegraph.rs index 7a1510842..4c8333e22 100644 --- a/src/core/src/ffi/nodegraph.rs +++ b/src/core/src/ffi/nodegraph.rs @@ -3,6 +3,7 @@ use std::os::raw::c_char; use std::slice; use crate::index::sbt::Update; +use crate::index::Comparable; use crate::sketch::nodegraph::Nodegraph; use crate::ffi::minhash::SourmashKmerMinHash; @@ -157,9 +158,54 @@ pub unsafe extern "C" fn nodegraph_update_mh( let ng = SourmashNodegraph::as_rust_mut(ptr); let mh = SourmashKmerMinHash::as_rust(optr); + // FIXME raise an exception properly mh.update(ng).unwrap(); } +#[no_mangle] +pub unsafe extern "C" fn nodegraph_containment( + ptr: *const SourmashNodegraph, + optr: *const SourmashNodegraph, +) -> f64 { + let ng = SourmashNodegraph::as_rust(ptr); + let ong = SourmashNodegraph::as_rust(optr); + + ng.containment(ong) +} + +#[no_mangle] +pub unsafe extern "C" fn nodegraph_containment_mh( + ptr: *const SourmashNodegraph, + optr: *const SourmashKmerMinHash, +) -> f64 { + let ng = SourmashNodegraph::as_rust(ptr); + let mh = SourmashKmerMinHash::as_rust(optr); + + ng.containment(mh) +} + +#[no_mangle] +pub unsafe extern "C" fn nodegraph_similarity( + ptr: *const 
SourmashNodegraph, + optr: *const SourmashNodegraph, +) -> f64 { + let ng = SourmashNodegraph::as_rust(ptr); + let ong = SourmashNodegraph::as_rust(optr); + + ng.similarity(ong) +} + +#[no_mangle] +pub unsafe extern "C" fn nodegraph_similarity_mh( + ptr: *const SourmashNodegraph, + optr: *const SourmashKmerMinHash, +) -> f64 { + let ng = SourmashNodegraph::as_rust(ptr); + let mh = SourmashKmerMinHash::as_rust(optr); + + ng.similarity(mh) +} + ffi_fn! { unsafe fn nodegraph_from_path(filename: *const c_char) -> Result<*mut SourmashNodegraph> { // FIXME use buffer + len instead of c_str diff --git a/src/core/src/sketch/nodegraph.rs b/src/core/src/sketch/nodegraph.rs index 122e1159a..12be5d516 100644 --- a/src/core/src/sketch/nodegraph.rs +++ b/src/core/src/sketch/nodegraph.rs @@ -7,6 +7,7 @@ use byteorder::{BigEndian, ByteOrder, LittleEndian, ReadBytesExt, WriteBytesExt} use fixedbitset::FixedBitSet; use crate::index::sbt::Update; +use crate::index::Comparable; use crate::sketch::minhash::KmerMinHash; use crate::Error; use crate::HashIntoType; @@ -288,8 +289,20 @@ impl Nodegraph { pub fn unique_kmers(&self) -> usize { self.unique_kmers } +} + +impl Comparable<&Nodegraph> for Nodegraph { + fn similarity(&self, other: &&Nodegraph) -> f64 { + self.similarity(*other) + } + + fn containment(&self, other: &&Nodegraph) -> f64 { + self.containment(*other) + } +} - pub fn similarity(&self, other: &Nodegraph) -> f64 { +impl Comparable<Nodegraph> for Nodegraph { + fn similarity(&self, other: &Nodegraph) -> f64 { let result: usize = self .bs .iter() @@ -305,7 +318,7 @@ impl Nodegraph { result as f64 / size as f64 } - pub fn containment(&self, other: &Nodegraph) -> f64 { + fn containment(&self, other: &Nodegraph) -> f64 { let result: usize = self .bs .iter() @@ -317,6 +330,20 @@ impl Nodegraph { } } +impl Comparable<KmerMinHash> for Nodegraph { + fn similarity(&self, other: &KmerMinHash) -> f64 { + unimplemented!() + } + + fn containment(&self, other: &KmerMinHash) -> f64 { + /* + let result: usize =
other.mins().iter().map(|h| self.get(*h)).sum(); + result as f64 / self.size() as f64 + */ + unimplemented!() + } +} + fn twobit_repr(a: u8) -> HashIntoType { match a as char { 'A' => 0, diff --git a/src/sourmash/nodegraph.py b/src/sourmash/nodegraph.py index 8faa2eb87..53e2eced6 100644 --- a/src/sourmash/nodegraph.py +++ b/src/sourmash/nodegraph.py @@ -49,6 +49,26 @@ def update(self, other): # converted to a list of ints...) raise TypeError("Must be a Nodegraph or MinHash") + def containment(self, other): + if isinstance(other, Nodegraph): + return self._methodcall(lib.nodegraph_containment, other._objptr) + elif isinstance(other, MinHash): + return self._methodcall(lib.nodegraph_containment_mh, other._objptr) + else: + # FIXME: we could take sets here too (or anything that can be + # converted to a list of ints...) + raise TypeError("Must be a Nodegraph or MinHash") + + def similarity(self, other): + if isinstance(other, Nodegraph): + return self._methodcall(lib.nodegraph_similarity, other._objptr) + elif isinstance(other, MinHash): + return self._methodcall(lib.nodegraph_similarity_mh, other._objptr) + else: + # FIXME: we could take sets here too (or anything that can be + # converted to a list of ints...) + raise TypeError("Must be a Nodegraph or MinHash") + def count(self, h): if isinstance(h, str): return self._methodcall(lib.nodegraph_count_kmer, to_bytes(h)) diff --git a/src/sourmash/sbtmh.py b/src/sourmash/sbtmh.py index 6cbc33658..2bd0182ad 100644 --- a/src/sourmash/sbtmh.py +++ b/src/sourmash/sbtmh.py @@ -75,18 +75,19 @@ def _max_jaccard_underneath_internal_node(node, query): This should yield be an upper bound on the Jaccard similarity for any signature below this point. 
""" + query_bf = _get_bf(node, query) mh = query.minhash if len(mh) == 0: return 0.0 - # count the maximum number of hash matches beneath this node - matches = node.data.matches(mh) - # J(A, B) = |A intersection B| / |A union B| # If we use only |A| as denominator, it is the containment # Because |A| <= |A union B|, it is also an upper bound on the max jaccard - max_score = float(matches) / len(mh) + max_score = query_bf.containment(node.data) + + #matches = node.data.matches(mh) + #max_score = float(matches) / len(mh) return max_score @@ -103,7 +104,7 @@ def search_minhashes(node, sig, threshold, results=None): score = node.data.minhash.similarity(sig.minhash) else: # Node minhash comparison #query_bf = _get_bf(node, sig) - sig_mh = query.minhash + sig_mh = sig.minhash if len(sig_mh) == 0: return 0.0 @@ -137,7 +138,7 @@ def search(self, node, sig, threshold, results=None): score = node.data.minhash.similarity(sig.minhash) else: # internal object, not leaf. #query_bf = _get_bf(node, sig) - sig_mh = query.minhash + sig_mh = sig.minhash if len(sig_mh) == 0: return 0.0 @@ -171,6 +172,8 @@ def search_minhashes_containment(node, sig, threshold, results=None, downsample= if isinstance(node, SigLeaf): matches = node.data.minhash.count_common(mh, downsample) else: # Node or Leaf, Nodegraph by minhash comparison + #bf = _get_bf(node, sig) + #matches = bf.containment(node.data) * len(mh) matches = node.data.matches(mh) if len(mh) and float(matches) / len(mh) >= threshold: @@ -220,11 +223,12 @@ def search(self, node, query, threshold, results=None): if isinstance(node, SigLeaf): matches = mh.count_common(node.data.minhash, True) else: # Nodegraph by minhash comparison + #bf = _get_bf(node, query) + #score = bf.containment(node.data) matches = node.data.matches(mh) if not matches: return 0 - score = float(matches) / len(mh) if score < threshold: @@ -238,3 +242,14 @@ def search(self, node, query, threshold, results=None): return 1 return 0 + + +def _get_bf(node, query): + try: 
+ query_bf = query.bf + except AttributeError: + query_bf = node._factory() + query_bf.update(query.minhash) + query.bf = query_bf + + return query_bf