Skip to content

Commit

Permalink
getting things in place for _get_bf
Browse files Browse the repository at this point in the history
use get_bf

use containment directly

bench and update containment

create new function for intersection and union size

keep doing it inplace...

wip simdeez

remove simd
  • Loading branch information
luizirber committed Oct 27, 2020
1 parent b5db252 commit 78ab059
Show file tree
Hide file tree
Showing 6 changed files with 142 additions and 8 deletions.
8 changes: 8 additions & 0 deletions include/sourmash.h
Expand Up @@ -208,6 +208,10 @@ bool kmerminhash_track_abundance(const SourmashKmerMinHash *ptr);

void nodegraph_buffer_free(uint8_t *ptr, uintptr_t insize);

double nodegraph_containment(const SourmashNodegraph *ptr, const SourmashNodegraph *optr);

double nodegraph_containment_mh(const SourmashNodegraph *ptr, const SourmashKmerMinHash *optr);

bool nodegraph_count(SourmashNodegraph *ptr, uint64_t h);

bool nodegraph_count_kmer(SourmashNodegraph *ptr, const char *kmer);
Expand Down Expand Up @@ -238,6 +242,10 @@ uintptr_t nodegraph_ntables(const SourmashNodegraph *ptr);

void nodegraph_save(const SourmashNodegraph *ptr, const char *filename);

double nodegraph_similarity(const SourmashNodegraph *ptr, const SourmashNodegraph *optr);

double nodegraph_similarity_mh(const SourmashNodegraph *ptr, const SourmashKmerMinHash *optr);

const uint8_t *nodegraph_to_buffer(const SourmashNodegraph *ptr,
uint8_t compression,
uintptr_t *size);
Expand Down
20 changes: 20 additions & 0 deletions sourmash/nodegraph.py
Expand Up @@ -49,6 +49,26 @@ def update(self, other):
# converted to a list of ints...)
raise TypeError("Must be a Nodegraph or MinHash")

def containment(self, other):
    """Return the containment reported by the native library for `other`.

    `other` must be a Nodegraph or a MinHash; each kind is routed to its
    own FFI entry point (`nodegraph_containment` or
    `nodegraph_containment_mh`) together with the other object's pointer.

    Raises:
        TypeError: if `other` is neither a Nodegraph nor a MinHash.
    """
    # Select the native function first so there is a single call site below.
    if isinstance(other, Nodegraph):
        ffi_func = lib.nodegraph_containment
    elif isinstance(other, MinHash):
        ffi_func = lib.nodegraph_containment_mh
    else:
        # FIXME: we could take sets here too (or anything that can be
        # converted to a list of ints...)
        raise TypeError("Must be a Nodegraph or MinHash")

    return self._methodcall(ffi_func, other._objptr)

def similarity(self, other):
    """Return the similarity reported by the native library for `other`.

    `other` must be a Nodegraph or a MinHash; each kind is routed to its
    own FFI entry point (`nodegraph_similarity` or
    `nodegraph_similarity_mh`) together with the other object's pointer.

    Raises:
        TypeError: if `other` is neither a Nodegraph nor a MinHash.
    """
    # Select the native function first so there is a single call site below.
    if isinstance(other, Nodegraph):
        ffi_func = lib.nodegraph_similarity
    elif isinstance(other, MinHash):
        ffi_func = lib.nodegraph_similarity_mh
    else:
        # FIXME: we could take sets here too (or anything that can be
        # converted to a list of ints...)
        raise TypeError("Must be a Nodegraph or MinHash")

    return self._methodcall(ffi_func, other._objptr)

def count(self, h):
if isinstance(h, str):
return self._methodcall(lib.nodegraph_count_kmer, to_bytes(h))
Expand Down
25 changes: 20 additions & 5 deletions sourmash/sbtmh.py
Expand Up @@ -75,18 +75,19 @@ def _max_jaccard_underneath_internal_node(node, query):
This should yield an upper bound on the Jaccard similarity
for any signature below this point.
"""
query_bf = _get_bf(node, query)
mh = query.minhash

if len(mh) == 0:
return 0.0

# count the maximum number of hash matches beneath this node
matches = node.data.matches(mh)

# J(A, B) = |A intersection B| / |A union B|
# If we use only |A| as denominator, it is the containment
# Because |A| <= |A union B|, it is also an upper bound on the max jaccard
max_score = float(matches) / len(mh)
max_score = query_bf.containment(node.data)

#matches = node.data.matches(mh)
#max_score = float(matches) / len(mh)

return max_score

Expand Down Expand Up @@ -143,6 +144,8 @@ def search_minhashes_containment(node, sig, threshold, results=None, downsample=
if isinstance(node, SigLeaf):
matches = node.data.minhash.count_common(mh, downsample)
else: # Node or Leaf, Nodegraph by minhash comparison
#bf = _get_bf(node, sig)
#matches = bf.containment(node.data) * len(mh)
matches = node.data.matches(mh)

if results is not None:
Expand All @@ -165,11 +168,12 @@ def search(self, node, query, threshold, results=None):
if isinstance(node, SigLeaf):
matches = mh.count_common(node.data.minhash, True)
else: # Nodegraph by minhash comparison
#bf = _get_bf(node, query)
#score = bf.containment(node.data)
matches = node.data.matches(mh)

if not matches:
return 0

score = float(matches) / len(mh)

if score < threshold:
Expand All @@ -187,3 +191,14 @@ def search(self, node, query, threshold, results=None):
return 1

return 0


def _get_bf(node, query):
    """Return the filter cached on `query`, building it on first use.

    If `query` has no `bf` attribute yet, a new object is created with
    `node._factory()`, updated with `query.minhash`, stored as `query.bf`
    and returned. Later calls return the stored object unchanged.
    """
    # A private sentinel distinguishes "attribute missing" from any real
    # stored value (including None).
    missing = object()
    cached = getattr(query, "bf", missing)

    if cached is missing:
        cached = node._factory()
        cached.update(query.minhash)
        query.bf = cached

    return cached
20 changes: 19 additions & 1 deletion src/core/benches/nodegraph.rs
Expand Up @@ -4,6 +4,7 @@ extern crate criterion;
use std::fs::File;
use std::io::{BufWriter, Cursor, Read};

use sourmash::index::Comparable;
use sourmash::sketch::nodegraph::Nodegraph;

use criterion::Criterion;
Expand Down Expand Up @@ -49,5 +50,22 @@ fn save_load(c: &mut Criterion) {
});
}

criterion_group!(nodegraph, save_load);
/// Benchmarks `Comparable::containment` and `Comparable::similarity` for a
/// nodegraph compared against a clone of itself.
///
/// The nodegraph is loaded once, outside the timed closures, from the
/// `.sbt.v3/internal.0` test fixture (path is relative to the directory the
/// benchmark is run from).
///
/// # Panics
///
/// Panics if the fixture cannot be opened or parsed.
fn comparable(c: &mut Criterion) {
    let mut group = c.benchmark_group("nodegraph");
    group.sample_size(10);

    let f = File::open("../../tests/test-data/.sbt.v3/internal.0")
        .expect("nodegraph benchmark fixture should be readable");
    let ng = Nodegraph::from_reader(f).expect("benchmark fixture should be a valid nodegraph");
    let ng_2 = ng.clone();

    group.bench_function("comparable containment", |b| {
        b.iter(|| ng.containment(&ng_2));
    });

    group.bench_function("comparable similarity", |b| {
        b.iter(|| ng.similarity(&ng_2));
    });

    // Close the group explicitly instead of relying on it being dropped.
    group.finish();
}

criterion_group!(nodegraph, save_load, comparable);
criterion_main!(nodegraph);
46 changes: 46 additions & 0 deletions src/core/src/ffi/nodegraph.rs
Expand Up @@ -3,6 +3,7 @@ use std::os::raw::c_char;
use std::slice;

use crate::index::sbt::Update;
use crate::index::Comparable;
use crate::sketch::nodegraph::Nodegraph;

use crate::ffi::minhash::SourmashKmerMinHash;
Expand Down Expand Up @@ -157,9 +158,54 @@ pub unsafe extern "C" fn nodegraph_update_mh(
let ng = SourmashNodegraph::as_rust_mut(ptr);
let mh = SourmashKmerMinHash::as_rust(optr);

// FIXME raise an exception properly
mh.update(ng).unwrap();
}

/// C entry point: containment of one nodegraph with respect to another.
///
/// Converts both pointers with `SourmashNodegraph::as_rust` and returns
/// `containment` called on the first with the second as argument.
///
/// # Safety
///
/// `ptr` and `optr` are passed to `SourmashNodegraph::as_rust` unchecked;
/// callers must supply pointers that function accepts.
/// NOTE(review): presumably both must be non-null and point to live
/// nodegraphs for the duration of the call; confirm against `as_rust`.
#[no_mangle]
pub unsafe extern "C" fn nodegraph_containment(
    ptr: *const SourmashNodegraph,
    optr: *const SourmashNodegraph,
) -> f64 {
    let lhs = SourmashNodegraph::as_rust(ptr);
    let rhs = SourmashNodegraph::as_rust(optr);

    lhs.containment(rhs)
}

/// C entry point: containment between a nodegraph and a MinHash sketch.
///
/// Converts `ptr` with `SourmashNodegraph::as_rust` and `optr` with
/// `SourmashKmerMinHash::as_rust`, then returns `containment` called on the
/// nodegraph with the MinHash as argument.
///
/// NOTE(review): if this resolves to a `Comparable<KmerMinHash>`
/// implementation that is still `unimplemented!()`, the call panics inside
/// an `extern "C"` function; confirm before exposing it to callers.
///
/// # Safety
///
/// Both pointers are passed to their respective `as_rust` unchecked; callers
/// must supply pointers those functions accept.
/// NOTE(review): presumably both must be non-null and point to live objects
/// for the duration of the call; confirm against `as_rust`.
#[no_mangle]
pub unsafe extern "C" fn nodegraph_containment_mh(
    ptr: *const SourmashNodegraph,
    optr: *const SourmashKmerMinHash,
) -> f64 {
    let graph = SourmashNodegraph::as_rust(ptr);
    let sketch = SourmashKmerMinHash::as_rust(optr);

    graph.containment(sketch)
}

/// C entry point: similarity between two nodegraphs.
///
/// Converts both pointers with `SourmashNodegraph::as_rust` and returns
/// `similarity` called on the first with the second as argument.
///
/// # Safety
///
/// `ptr` and `optr` are passed to `SourmashNodegraph::as_rust` unchecked;
/// callers must supply pointers that function accepts.
/// NOTE(review): presumably both must be non-null and point to live
/// nodegraphs for the duration of the call; confirm against `as_rust`.
#[no_mangle]
pub unsafe extern "C" fn nodegraph_similarity(
    ptr: *const SourmashNodegraph,
    optr: *const SourmashNodegraph,
) -> f64 {
    let lhs = SourmashNodegraph::as_rust(ptr);
    let rhs = SourmashNodegraph::as_rust(optr);

    lhs.similarity(rhs)
}

/// C entry point: similarity between a nodegraph and a MinHash sketch.
///
/// Converts `ptr` with `SourmashNodegraph::as_rust` and `optr` with
/// `SourmashKmerMinHash::as_rust`, then returns `similarity` called on the
/// nodegraph with the MinHash as argument.
///
/// NOTE(review): if this resolves to a `Comparable<KmerMinHash>`
/// implementation that is still `unimplemented!()`, the call panics inside
/// an `extern "C"` function; confirm before exposing it to callers.
///
/// # Safety
///
/// Both pointers are passed to their respective `as_rust` unchecked; callers
/// must supply pointers those functions accept.
/// NOTE(review): presumably both must be non-null and point to live objects
/// for the duration of the call; confirm against `as_rust`.
#[no_mangle]
pub unsafe extern "C" fn nodegraph_similarity_mh(
    ptr: *const SourmashNodegraph,
    optr: *const SourmashKmerMinHash,
) -> f64 {
    let graph = SourmashNodegraph::as_rust(ptr);
    let sketch = SourmashKmerMinHash::as_rust(optr);

    graph.similarity(sketch)
}

ffi_fn! {
unsafe fn nodegraph_from_path(filename: *const c_char) -> Result<*mut SourmashNodegraph> {
// FIXME use buffer + len instead of c_str
Expand Down
31 changes: 29 additions & 2 deletions src/core/src/sketch/nodegraph.rs
Expand Up @@ -7,6 +7,7 @@ use byteorder::{BigEndian, ByteOrder, LittleEndian, ReadBytesExt, WriteBytesExt}
use fixedbitset::FixedBitSet;

use crate::index::sbt::Update;
use crate::index::Comparable;
use crate::sketch::minhash::KmerMinHash;
use crate::Error;
use crate::HashIntoType;
Expand Down Expand Up @@ -288,8 +289,20 @@ impl Nodegraph {
pub fn unique_kmers(&self) -> usize {
self.unique_kmers
}
}

/// Allows a `Nodegraph` to be compared against a `&Nodegraph` operand.
///
/// Each method removes the extra level of reference from `other` and
/// forwards to the method of the same name that accepts a plain
/// `&Nodegraph`.
impl Comparable<&Nodegraph> for Nodegraph {
    fn similarity(&self, other: &&Nodegraph) -> f64 {
        // `&Nodegraph` is `Copy`, so destructuring just peels one reference.
        let &target = other;
        self.similarity(target)
    }

    fn containment(&self, other: &&Nodegraph) -> f64 {
        let &target = other;
        self.containment(target)
    }
}

pub fn similarity(&self, other: &Nodegraph) -> f64 {
impl Comparable<Nodegraph> for Nodegraph {
fn similarity(&self, other: &Nodegraph) -> f64 {
let result: usize = self
.bs
.iter()
Expand All @@ -305,7 +318,7 @@ impl Nodegraph {
result as f64 / size as f64
}

pub fn containment(&self, other: &Nodegraph) -> f64 {
fn containment(&self, other: &Nodegraph) -> f64 {
let result: usize = self
.bs
.iter()
Expand All @@ -317,6 +330,20 @@ impl Nodegraph {
}
}

/// Placeholder comparison between a `Nodegraph` and a `KmerMinHash`.
///
/// Neither method is implemented yet.
///
/// # Panics
///
/// Both `similarity` and `containment` always panic via `unimplemented!()`.
impl Comparable<KmerMinHash> for Nodegraph {
    fn similarity(&self, _other: &KmerMinHash) -> f64 {
        unimplemented!()
    }

    fn containment(&self, _other: &KmerMinHash) -> f64 {
        // TODO: draft left disabled by the original author (with the
        // parameter named `other`):
        //
        //     let result: usize = other.mins().iter().map(|h| self.get(*h)).sum();
        //     result as f64 / self.size() as f64
        unimplemented!()
    }
}

fn twobit_repr(a: u8) -> HashIntoType {
match a as char {
'A' => 0,
Expand Down

0 comments on commit 78ab059

Please sign in to comment.