From 0d69b7973e1b4e690933b27aeb8d11becc876e19 Mon Sep 17 00:00:00 2001
From: Luiz Irber
Date: Sun, 7 Jul 2019 00:04:54 +0000
Subject: [PATCH] Big refactor on the rust codebase (#692)

save semi-working initial bigsi
Rename Leaf to Dataset
remove storage from readdata trait
basic search working
start best_only
UKHS save and load, expose draff command
move ffi to mod dir
split ffi functions
initial draff command
keeping track of unique kmers with a Nodegraph
Fix overcounting, add HLL storage
make tests deterministic
move Signature from root to submodule
move KmerMinHash into submodule
fix finch
make index consistent with other submodules
starting support for multiple signature types
move add_sequence and check_compatible to SigsTrait
use unimplemented if something is missing
initial docs and warnings cleanup
move sbt into submodule
reorganize sbt module
insertion working for SBT
Replace derive_builder with typed_builder
using fastx parser from rust-bio
search saves output
Rename Signatures enum to Sketch
Move nodegraph into sketch
Storage args are a proper enum now
explicit sig and dataset conversion
Use syntax inspired by czbiohub/extract_kmers for codon table
(original commit: https://github.com/czbiohub/extract_kmers/blob/d036145000bff96454ec383b193f8913fc5e2b16/src/codon_table.rs)
demo command for counting unique hashes in an SBT
implement sbt v4 parsing, and clean up clippy warnings
Start replacing the Factory struct with an enum
move smrs bin to src/bin
split draff commands from smrs
fix v4 loading
---
 Cargo.toml                         |  18 +-
 benches/index.rs                   |  95 +--
 include/sourmash.h                 |   6 +
 ocf/src/lib.rs                     |  14 +-
 src/bin/draff.rs                   |  60 ++
 src/bin/draff.yml                  |  66 +++
 src/bin/smrs.rs                    | 448 ++++++++++++++
 src/{ => bin}/sourmash.yml         |  17 +-
 src/cmd.rs                         | 179 ++++++
 src/errors.rs                      |  11 +
 src/ffi.rs                         | 603 -------------------
 src/ffi/minhash.rs                 | 336 +++++++++++
 src/ffi/mod.rs                     |  22 +
 src/ffi/signature.rs               | 289 +++++++++
 src/from.rs                        |  89 ++-
 src/index.rs                       | 161 -----
 src/index/bigsi.rs                 | 228 +++++++
 src/index/linear.rs                | 167 +++++-
 src/index/mod.rs                   | 245 ++++++++
 src/index/sbt.rs                   | 602 -------------------
 src/index/sbt/mhbt.rs              | 110 ++++
 src/index/sbt/mod.rs               | 924 +++++++++++++++++++++++++++++
 src/index/sbt/ukhs.rs              | 129 ++++
 src/index/search.rs                |  21 +
 src/index/storage.rs               | 116 +++-
 src/lib.rs                         | 815 +------------------------
 src/main.rs                        | 274 ---------
 src/signature.rs                   | 293 +++++++++
 src/sketch/minhash.rs              | 687 +++++++++++++++++++++
 src/sketch/mod.rs                  |  15 +
 src/{index => sketch}/nodegraph.rs |  28 +-
 src/sketch/ukhs.rs                 | 604 +++++++++++++++++++
 tests/finch.rs                     |  87 ---
 tests/minhash.rs                   |  25 +-
 tests/signature.rs                 |   2 +-
 tests/test-data/v4.sbt.json        |   1 +
 36 files changed, 5160 insertions(+), 2627 deletions(-)
 create mode 100644 src/bin/draff.rs
 create mode 100644 src/bin/draff.yml
 create mode 100644 src/bin/smrs.rs
 rename src/{ => bin}/sourmash.yml (89%)
 create mode 100644 src/cmd.rs
 delete mode 100644 src/ffi.rs
 create mode 100644 src/ffi/minhash.rs
 create mode 100644 src/ffi/mod.rs
 create mode 100644 src/ffi/signature.rs
 delete mode 100644 src/index.rs
 create mode 100644 src/index/bigsi.rs
 create mode 100644 src/index/mod.rs
 delete mode 100644 src/index/sbt.rs
 create mode 100644 src/index/sbt/mhbt.rs
 create mode 100644 src/index/sbt/mod.rs
 create mode 100644 src/index/sbt/ukhs.rs
 delete mode 100644 src/main.rs
 create mode 100644 src/signature.rs
 create mode 100644 src/sketch/minhash.rs
 create mode 100644 src/sketch/mod.rs
 rename src/{index => sketch}/nodegraph.rs (97%)
 create mode 100644 src/sketch/ukhs.rs
 create mode 100644 tests/test-data/v4.sbt.json
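Editorial sketch (not part of the patch): the central API rename in this refactor is Leaf -> Dataset and find(search_fn, ...) -> search(sig, threshold, containment). The snippet below, assuming the test-data path and 0.1 threshold used by the updated benchmarks further down, shows how the renamed API is exercised; the `demo` wrapper and the `failure::Error` return type are illustrative choices, not code from this commit.

    use sourmash::index::{Dataset, Index, MHBT};
    use sourmash::signature::Signature;

    fn demo() -> Result<(), failure::Error> {
        // Load a MinHash Bloom Tree (SBT) from its JSON description.
        let sbt = MHBT::from_path("tests/test-data/v5.sbt.json")?;

        // Dataset replaces the old Leaf type as the unit stored in an index.
        let query: Dataset<Signature> = (*sbt.datasets().first().unwrap()).clone();

        // search(sig, threshold, containment) replaces find(search_minhashes, sig, threshold).
        for m in sbt.search(&query, 0.1, false)? {
            println!("match: {:?}", m);
        }
        Ok(())
    }
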
diff --git a/Cargo.toml b/Cargo.toml
index fc8e7bbac..72aafe064 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "sourmash"
-version = "0.2.3"
+version = "0.3.0"
 authors = ["Luiz Irber "]
 description = "MinHash sketches for genomic data"
 repository = "https://github.com/luizirber/sourmash-rust"
@@ -17,11 +17,6 @@ bench = false
 [profile.release]
 lto=true
 
-[[bin]]
-bench = false
-path = "src/main.rs"
-name = "smrs"
-
 [features]
 from-finch = ["finch", "needletail"]
 
@@ -31,11 +26,9 @@ from-finch = ["finch", "needletail"]
 #cbindgen = "~0.6.7"
 
 [dependencies]
-backtrace = "=0.3.9" # https://github.com/alexcrichton/backtrace-rs/issues/147
 byteorder = "^1.2"
 cfg-if = "0.1"
 clap = { version = "~2.32", features = ["yaml"] }
-derive_builder = "^0.7"
 env_logger = "0.6.0"
 exitfailure = "0.5.1"
 failure = "0.1.3"
@@ -52,6 +45,13 @@ needletail = { version = "~0.2.1", optional = true }
 serde = "1.0"
 serde_derive = "~1.0.58"
 serde_json = "1.0.2"
+ukhs = "0.3.4"
+bio = { git = "https://github.com/luizirber/rust-bio", branch = "feature/fastx_reader" }
+primal = "0.2.3"
+pdatastructs = "0.5.0"
+itertools = "0.8.0"
+typed-builder = "0.3.0"
+csv = "1.0.7"
 
 [target.'cfg(target_arch = "wasm32")'.dependencies.wasm-bindgen]
 version = "^0.2"
@@ -67,6 +67,8 @@ features = ["bz2"]
 proptest = "^0.8"
 criterion = "^0.2"
 rand = "^0.5"
+tempfile = "3"
+assert_matches = "1.2"
 
 [[bench]]
 name = "index"
diff --git a/benches/index.rs b/benches/index.rs
index b0e71115c..2ea53a5c7 100644
--- a/benches/index.rs
+++ b/benches/index.rs
@@ -4,79 +4,98 @@ extern crate criterion;
 use std::path::PathBuf;
 
 use criterion::{Bencher, Criterion, Fun};
-use sourmash::index::linear::LinearIndexBuilder;
-use sourmash::index::nodegraph::Nodegraph;
-use sourmash::index::sbt::{Node, MHBT, SBT};
-use sourmash::index::search::search_minhashes;
-use sourmash::index::{Index, Leaf};
-use sourmash::Signature;
+use sourmash::index::bigsi::BIGSI;
+use sourmash::index::linear::LinearIndex;
+use sourmash::index::storage::ReadData;
+use sourmash::index::MHBT;
+use sourmash::index::{Dataset, Index};
+use sourmash::signature::Signature;
 
 fn find_small_bench(c: &mut Criterion) {
     let mut filename = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
     filename.push("tests/test-data/v5.sbt.json");
 
-    let sbt: MHBT = SBT::from_path(filename).expect("Loading error");
+    let sbt = MHBT::from_path(filename).expect("Loading error");
 
-    let leaf: Leaf<Signature> = (*sbt.leaves().first().unwrap()).clone();
+    let leaf: Dataset<Signature> = (*sbt.datasets().first().unwrap()).clone();
 
-    let mut linear = LinearIndexBuilder::default()
-        .storage(sbt.storage())
-        .build()
-        .unwrap();
-    for l in &sbt.leaves() {
+    let mut linear = LinearIndex::builder().storage(sbt.storage()).build();
+
+    for l in &sbt.datasets() {
         linear.insert(l);
     }
 
+    let mut bigsi = BIGSI::new(10000, 10);
+    for l in &sbt.datasets() {
+        let data = l.data().unwrap();
+        bigsi.insert(data);
+    }
+
     let sbt_find = Fun::new(
-        "sbt_find",
-        move |b: &mut Bencher, leaf: &Leaf<Signature>| {
-            b.iter(|| sbt.find(search_minhashes, leaf, 0.1))
-        },
+        "sbt_search",
+        move |b: &mut Bencher, leaf: &Dataset<Signature>| b.iter(|| sbt.search(leaf, 0.1, false)),
     );
 
     let linear_find = Fun::new(
-        "linear_find",
-        move |b: &mut Bencher, leaf: &Leaf<Signature>| {
-            b.iter(|| linear.find(search_minhashes, leaf, 0.1))
+        "linear_search",
+        move |b: &mut Bencher, leaf: &Dataset<Signature>| {
+            b.iter(|| linear.search(leaf, 0.1, false))
+        },
+    );
+
+    let bigsi_find = Fun::new(
+        "bigsi_search",
+        move |b: &mut Bencher, leaf: &Dataset<Signature>| {
+            let data = leaf.data().unwrap();
+            b.iter(|| bigsi.search(data, 0.1, false))
         },
     );
 
-    let functions = vec![sbt_find, linear_find];
-    c.bench_functions("find_small", functions, leaf);
+    let functions = vec![sbt_find, linear_find, bigsi_find];
+    c.bench_functions("search_small", functions, leaf);
 }
 
 fn find_subset_bench(c: &mut Criterion) {
     let mut filename = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
     filename.push("tests/test-data/subset.sbt.json");
 
-    let sbt: MHBT = SBT::from_path(filename).expect("Loading error");
+    let sbt = MHBT::from_path(filename).expect("Loading error");
 
-    let leaf: Leaf<Signature> = (*sbt.leaves().first().unwrap()).clone();
+    let leaf: Dataset<Signature> = (*sbt.datasets().first().unwrap()).clone();
 
-    let mut linear = LinearIndexBuilder::default()
-        .storage(sbt.storage())
-        .build()
-        .unwrap();
-    for l in &sbt.leaves() {
+    let mut linear = LinearIndex::builder().storage(sbt.storage()).build();
+    for l in &sbt.datasets() {
         linear.insert(l);
     }
 
+    let mut bigsi = BIGSI::new(10000, 10);
+    for l in &sbt.datasets() {
+        let data = l.data().unwrap();
+        bigsi.insert(data);
+    }
+
     let sbt_find = Fun::new(
-        "sbt_find",
-        move |b: &mut Bencher, leaf: &Leaf<Signature>| {
-            b.iter(|| sbt.find(search_minhashes, leaf, 0.1))
-        },
+        "sbt_search",
+        move |b: &mut Bencher, leaf: &Dataset<Signature>| b.iter(|| sbt.search(leaf, 0.1, false)),
    );
 
     let linear_find = Fun::new(
-        "linear_find",
-        move |b: &mut Bencher, leaf: &Leaf<Signature>| {
-            b.iter(|| linear.find(search_minhashes, leaf, 0.1))
+        "linear_search",
+        move |b: &mut Bencher, leaf: &Dataset<Signature>| {
+            b.iter(|| linear.search(leaf, 0.1, false))
+        },
+    );
+
+    let bigsi_find = Fun::new(
+        "bigsi_search",
+        move |b: &mut Bencher, leaf: &Dataset<Signature>| {
+            let data = leaf.data().unwrap();
+            b.iter(|| bigsi.search(data, 0.1, false))
         },
     );
 
-    let functions = vec![sbt_find, linear_find];
-    c.bench_functions("find_subset", functions, leaf);
+    let functions = vec![sbt_find, linear_find, bigsi_find];
+    c.bench_functions("search_subset", functions, leaf);
 }
 
 criterion_group!(benches, find_small_bench, find_subset_bench);
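Editorial sketch (not part of the patch): the new in-memory BIGSI index benchmarked above is queried with a plain Signature rather than a Dataset. This sketch mirrors the benchmark setup; `demo_bigsi` is a hypothetical helper, the `bf_size=10000, ksize=10` arguments are the bench values (not tuned defaults), and the test-data path is reused from the benches.

    use sourmash::index::bigsi::BIGSI;
    use sourmash::index::storage::ReadData;
    use sourmash::index::{Index, MHBT};

    fn demo_bigsi() -> Result<(), failure::Error> {
        let sbt = MHBT::from_path("tests/test-data/v5.sbt.json")?;

        // Mirror every dataset of an existing SBT into the BIGSI bit matrix.
        let mut bigsi = BIGSI::new(10000, 10);
        for l in &sbt.datasets() {
            let data = l.data().unwrap(); // lazily loads the underlying Signature
            bigsi.insert(data);
        }

        // Query with a Signature; 0.1 is the similarity threshold used in the benches.
        let leaf = sbt.datasets().first().unwrap().clone();
        let query = leaf.data().unwrap();
        println!("{} matches", bigsi.search(query, 0.1, false)?.len());
        Ok(())
    }
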
diff --git a/include/sourmash.h b/include/sourmash.h
index 1412f9dea..c91155f13 100644
--- a/include/sourmash.h
+++ b/include/sourmash.h
@@ -51,6 +51,12 @@ void kmerminhash_add_sequence(KmerMinHash *ptr, const char *sequence, bool force
 
 void kmerminhash_add_word(KmerMinHash *ptr, const char *word);
 
+void kmerminhash_remove_hash(KmerMinHash *ptr, uint64_t h);
+
+void kmerminhash_remove_many(KmerMinHash *ptr,
+                             const uint64_t *hashes_ptr,
+                             uintptr_t insize);
+
 double kmerminhash_compare(KmerMinHash *ptr, const KmerMinHash *other);
 
 uint64_t kmerminhash_count_common(KmerMinHash *ptr, const KmerMinHash *other);
diff --git a/ocf/src/lib.rs b/ocf/src/lib.rs
index baf07a9d1..f9662244f 100644
--- a/ocf/src/lib.rs
+++ b/ocf/src/lib.rs
@@ -87,7 +87,8 @@ pub fn get_readable(input_name: &str) -> Box<dyn io::Read> {
     match input_name {
         "-" => Box::new(BufReader::new(io::stdin())),
         _ => Box::new(BufReader::new(
-            File::open(input_name).expect(&format!("Can't open input file {}", input_name)),
+            File::open(input_name)
+                .unwrap_or_else(|_| panic!("Can't open input file {}", input_name)),
         )),
     }
 }
@@ -100,16 +101,16 @@ fn get_compression(mut in_stream: Box<dyn io::Read>) -> CompressionFormat {
         .expect("Error durring reading first bit of file");
 
     let mut five_bit_val: u64 = 0;
-    for i in 0..5 {
-        five_bit_val |= (buf[i] as u64) << 8 * (4 - i);
+    for (i, item) in buf.iter().enumerate().take(5) {
+        five_bit_val |= (u64::from(*item)) << (8 * (4 - i));
     }
     if CompressionFormat::from_u64(five_bit_val) == Some(CompressionFormat::Lzma) {
         return CompressionFormat::Lzma;
     }
 
     let mut two_bit_val: u64 = 0;
-    for i in 0..2 {
-        two_bit_val |= (buf[i] as u64) << 8 * (1 - i);
+    for (i, item) in buf.iter().enumerate().take(2) {
+        two_bit_val |= (u64::from(*item)) << (8 * (1 - i));
     }
 
     match CompressionFormat::from_u64(two_bit_val) {
@@ -213,7 +214,8 @@ fn get_writable(output_name: &str) -> Box<dyn io::Write> {
     match output_name {
         "-" => Box::new(BufWriter::new(io::stdout())),
         _ => Box::new(BufWriter::new(
-            File::create(output_name).expect(&format!("Can't open output file {}", output_name)),
+            File::create(output_name)
+                .unwrap_or_else(|_| panic!("Can't open output file {}", output_name)),
         )),
     }
 }
diff --git a/src/bin/draff.rs b/src/bin/draff.rs
new file mode 100644
index 000000000..e4f43c838
--- /dev/null
+++ b/src/bin/draff.rs
@@ -0,0 +1,60 @@
+use clap::{load_yaml, App};
+use exitfailure::ExitFailure;
+//use human_panic::setup_panic;
+use sourmash::cmd::{draff_compare, draff_index, draff_search, draff_signature};
+
+fn main() -> Result<(), ExitFailure> {
+    //setup_panic!();
+
+    env_logger::Builder::from_env(env_logger::Env::default().default_filter_or("info")).init();
+
+    let yml = load_yaml!("draff.yml");
+    let m = App::from_yaml(yml).get_matches();
+
+    match m.subcommand_name() {
+        Some("compute") => {
+            let cmd = m.subcommand_matches("compute").unwrap();
+            let inputs = cmd
+                .values_of("inputs")
+                .map(|vals| vals.collect::<Vec<_>>())
+                .unwrap();
+
+            let ksize: usize = cmd.value_of("ksize").unwrap().parse().unwrap();
+            let wsize: usize = cmd.value_of("wsize").unwrap().parse().unwrap();
+
+            draff_signature(inputs, ksize, wsize)?;
+        }
+        Some("search") => {
+            let cmd = m.subcommand_matches("search").unwrap();
+
+            let index: &str = cmd.value_of("index").unwrap();
+            let query: &str = cmd.value_of("query").unwrap();
+
+            draff_search(index, query)?;
+        }
+        Some("compare") => {
+            let cmd = m.subcommand_matches("compare").unwrap();
+            let inputs = cmd
+                .values_of("inputs")
+                .map(|vals| vals.collect::<Vec<_>>())
+                .unwrap();
+
+            draff_compare(inputs)?;
+        }
+        Some("index") => {
+            let cmd = m.subcommand_matches("index").unwrap();
+            let inputs = cmd
+                .values_of("inputs")
+                .map(|vals| vals.collect::<Vec<_>>())
+                .unwrap();
+
+            let output: &str = cmd.value_of("output").unwrap();
+
+            draff_index(inputs, output)?;
+        }
+        _ => {
+            println!("{:?}", m);
+        }
+    }
+    Ok(())
+}
diff --git a/src/bin/draff.yml b/src/bin/draff.yml
new file mode 100644
index 000000000..69c1c7139
--- /dev/null
+++ b/src/bin/draff.yml
@@ -0,0 +1,66 @@
+name: draff
+version: "0.0.1"
+about: "draff signature commands"
+author: Luiz Irber
+
+settings:
+    - SubcommandRequiredElseHelp
+
+subcommands:
+    - compute:
+        about: create a draff signature
+        settings:
+          - ArgRequiredElseHelp
+        args:
+            - ksize:
+                help: ksize
+                short: K
+                default_value: "9"
+                takes_value: true
+                required: false
+            - wsize:
+                help: window size
+                short: W
+                default_value: "31"
+                takes_value: true
+                required: false
+            - output:
+                help: alternative output file
+                short: o
+                takes_value: true
+                required: false
+            - inputs:
+                help: FASTA files
+                multiple: true
+    - index:
+        about: create a draff index
+        settings:
+          - ArgRequiredElseHelp
+        args:
+            - output:
+                help: alternative output file
+                short: o
+                takes_value: true
+                required: false
+            - inputs:
+                help: draff signatures
+                multiple: true
+    - compare:
+        about: compare draff signatures
+        settings:
+          - ArgRequiredElseHelp
+        args:
+            - inputs:
+                help: draff signatures
+                multiple: true
+    - search:
+        about: search a draff index
+        settings:
+          - ArgRequiredElseHelp
+        args:
+            - index:
+                help: index to search
+                required: true
+            - query:
+                help: draff signature to search
+                required: true
diff --git a/src/bin/smrs.rs b/src/bin/smrs.rs
new file mode 100644
index 000000000..d0ea0b084
--- /dev/null
+++ b/src/bin/smrs.rs
@@ -0,0 +1,448 @@
+use std::fs::File;
+use std::io;
+use std::path::Path;
+use std::rc::Rc;
+
+use clap::{load_yaml, App};
+use exitfailure::ExitFailure;
+use failure::Error;
+//use human_panic::setup_panic;
+use lazy_init::Lazy;
+use log::{info, LevelFilter};
+use ocf::{get_output, CompressionFormat};
+use serde::ser::SerializeStruct;
+use serde::{Serialize, Serializer};
+
+use sourmash::cmd::{
+    count_unique, draff_compare, draff_index, draff_search, draff_signature, prepare,
+};
+use sourmash::index::linear::LinearIndex;
+use sourmash::index::sbt::scaffold;
+use sourmash::index::search::{
+    search_minhashes, search_minhashes_containment, search_minhashes_find_best,
+};
+use sourmash::index::{Comparable, Dataset, Index, MHBT};
+use sourmash::signature::{Signature, SigsTrait};
+use sourmash::sketch::Sketch;
+
+struct Query<T> {
+    data: T,
+}
+
+impl Query<Signature> {
+    fn ksize(&self) -> u64 {
+        // TODO: select the correct signature
+        self.data.signatures[0].ksize() as u64
+    }
+
+    fn moltype(&self) -> String {
+        // TODO: this might panic
+        match &self.data.signatures[0] {
+            Sketch::MinHash(mh) => {
+                if mh.is_protein() {
+                    "protein".into()
+                } else {
+                    "DNA".into()
+                }
+            }
+            Sketch::UKHS(_) => {
+                // TODO: draff only supports dna for now
+                "DNA".into()
+            }
+        }
+    }
+
+    fn name(&self) -> String {
+        self.data.name().clone()
+    }
+}
+
+impl From<Query<Signature>> for Dataset<Signature> {
+    fn from(other: Query<Signature>) -> Dataset<Signature> {
+        let data = Lazy::new();
+        data.get_or_create(|| other.data);
+
+        Dataset::builder()
+            .data(Rc::new(data))
+            .filename("")
+            .name("")
+            .metadata("")
+            .storage(None)
+            .build()
+    }
+}
+
+fn load_query_signature(
+    query: &str,
+    ksize: usize,
+    moltype: Option<&str>,
+    scaled: Option<u64>,
+) -> Result<Query<Signature>, Error> {
+    let mut reader = io::BufReader::new(File::open(query)?);
+    let sigs = Signature::load_signatures(&mut reader, ksize, moltype, scaled)?;
+
+    //dbg!(&sigs);
+    // TODO: what if we have more than one left?
+    let data = sigs[0].clone();
+
+    Ok(Query { data })
+}
+
+struct Database {
+    data: Indices,
+    path: String,
+}
+
+enum Indices {
+    MHBT(MHBT),
+    LinearIndex(LinearIndex<Dataset<Signature>>),
+}
+
+impl Index for Database {
+    type Item = Dataset<Signature>;
+
+    fn find<F>(
+        &self,
+        search_fn: F,
+        sig: &Self::Item,
+        threshold: f64,
+    ) -> Result<Vec<&Self::Item>, Error>
+    where
+        F: Fn(&dyn Comparable<Self::Item>, &Self::Item, f64) -> bool,
+    {
+        match &self.data {
+            Indices::MHBT(data) => data.find(search_fn, sig, threshold),
+            Indices::LinearIndex(data) => data.find(search_fn, sig, threshold),
+        }
+    }
+
+    fn insert(&mut self, node: &Self::Item) -> Result<(), Error> {
+        match &mut self.data {
+            Indices::MHBT(data) => data.insert(node),
+            Indices::LinearIndex(data) => data.insert(node),
+        }
+    }
+
+    fn save<P: AsRef<Path>>(&self, path: P) -> Result<(), Error> {
+        match &self.data {
+            Indices::MHBT(data) => data.save(path),
+            Indices::LinearIndex(data) => data.save(path),
+        }
+    }
+
+    fn load<P: AsRef<Path>>(_path: P) -> Result<(), Error> {
+        unimplemented!();
+    }
+
+    fn datasets(&self) -> Vec<Self::Item> {
+        match &self.data {
+            Indices::MHBT(data) => data.datasets(),
+            Indices::LinearIndex(data) => data.datasets(),
+        }
+    }
+}
+
+fn load_sbts_and_sigs(
+    filenames: &[&str],
+    query: &Query<Signature>,
+    _containment: bool,
+    traverse: bool,
+) -> Result<Vec<Database>, Error> {
+    let mut dbs = Vec::default();
+
+    let _ksize = query.ksize();
+    let _moltype = query.moltype();
+
+    let n_signatures = 0;
+    let mut n_databases = 0;
+
+    for path in filenames {
+        if traverse && Path::new(path).is_dir() {
+            continue;
+        }
+
+        if let Ok(data) = MHBT::from_path(path) {
+            // TODO: check compatible
+            dbs.push(Database {
+                data: Indices::MHBT(data),
+                path: String::from(*path),
+            });
+            info!("loaded SBT {}", path);
+            n_databases += 1;
+            continue;
+        } else if let Ok(data) = LinearIndex::<Dataset<Signature>>::from_path(path) {
+            // TODO: check compatible
+            dbs.push(Database {
+                data: Indices::LinearIndex(data),
+                path: String::from(*path),
+            });
+            info!("loaded LinearIndex {}", path);
+            n_databases += 1;
+            continue;
+        }
+
+        // TODO: load sig, need to change Database
+        // IDEA: put sig into a LinearIndex, and replace Database with a Box?
+    }
+
+    if n_signatures > 0 && n_databases > 0 {
+        info!(
+            "loaded {} signatures and {} databases total.",
+            n_signatures, n_databases
+        );
+    } else if n_signatures > 0 {
+        info!("loaded {} signatures.", n_signatures);
+    } else if n_databases > 0 {
+        info!("loaded {} databases.", n_databases);
+    } else {
+        return Err(failure::err_msg("Couldn't load any databases"));
+    }
+
+    Ok(dbs)
+}
+
+struct Results {
+    similarity: f64,
+    match_sig: Signature,
+    db: String,
+}
+
+impl Serialize for Results {
+    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
+    where
+        S: Serializer,
+    {
+        let mut partial = serializer.serialize_struct("Results", 4)?;
+        partial.serialize_field("similarity", &self.similarity)?;
+        partial.serialize_field("name", &self.match_sig.name())?;
+        partial.serialize_field("filename", &self.db)?;
+        partial.serialize_field("md5", &self.match_sig.md5sum())?;
+        partial.end()
+    }
+}
+
+fn search_databases(
+    query: Query<Signature>,
+    databases: &[Database],
+    threshold: f64,
+    containment: bool,
+    best_only: bool,
+    _ignore_abundance: bool,
+) -> Result<Vec<Results>, Error> {
+    let mut results = Vec::default();
+
+    let search_fn = if best_only {
+        search_minhashes_find_best()
+    } else if containment {
+        search_minhashes_containment
+    } else {
+        search_minhashes
+    };
+    let query_leaf = query.into();
+
+    // TODO: set up scaled for DB and query
+
+    for db in databases {
+        let matches = db.find(search_fn, &query_leaf, threshold).unwrap();
+        for dataset in matches.into_iter() {
+            let similarity = query_leaf.similarity(dataset);
+
+            // should always be true, but... better safe than sorry.
+            if similarity >= threshold {
+                results.push(Results {
+                    similarity,
+                    match_sig: dataset.clone().into(),
+                    db: db.path.clone(),
+                })
+            }
+        }
+    }
+
+    results.sort_by(|a, b| b.similarity.partial_cmp(&a.similarity).unwrap());
+    Ok(results)
+}
+
+fn main() -> Result<(), ExitFailure> {
+    //setup_panic!();
+
+    env_logger::Builder::from_env(env_logger::Env::default().default_filter_or("info")).init();
+
+    let yml = load_yaml!("sourmash.yml");
+    let m = App::from_yaml(yml).get_matches();
+
+    match m.subcommand_name() {
+        Some("draff") => {
+            let cmd = m.subcommand_matches("draff").unwrap();
+            let inputs = cmd
+                .values_of("inputs")
+                .map(|vals| vals.collect::<Vec<_>>())
+                .unwrap();
+
+            let ksize: usize = cmd.value_of("ksize").unwrap().parse().unwrap();
+            let wsize: usize = cmd.value_of("wsize").unwrap().parse().unwrap();
+
+            draff_signature(inputs, ksize, wsize)?;
+        }
+        Some("draff_search") => {
+            let cmd = m.subcommand_matches("draff_search").unwrap();
+
+            let index: &str = cmd.value_of("index").unwrap();
+            let query: &str = cmd.value_of("query").unwrap();
+
+            draff_search(index, query)?;
+        }
+        Some("draff_compare") => {
+            let cmd = m.subcommand_matches("draff_compare").unwrap();
+            let inputs = cmd
+                .values_of("inputs")
+                .map(|vals| vals.collect::<Vec<_>>())
+                .unwrap();
+
+            draff_compare(inputs)?;
+        }
+        Some("prepare") => {
+            let cmd = m.subcommand_matches("prepare").unwrap();
+            let index: &str = cmd.value_of("index").unwrap();
+
+            prepare(index)?;
+        }
+        Some("index") => {
+            let cmd = m.subcommand_matches("index").unwrap();
+            let inputs = cmd
+                .values_of("inputs")
+                .map(|vals| vals.collect::<Vec<_>>())
+                .unwrap();
+
+            let output: &str = cmd.value_of("output").unwrap();
+
+            draff_index(inputs, output)?;
+        }
+        Some("scaffold") => {
+            let cmd = m.subcommand_matches("scaffold").unwrap();
+            let sbt_file = cmd.value_of("current_sbt").unwrap();
+
+            let sbt = MHBT::from_path(sbt_file)?;
+            let mut new_sbt: MHBT = scaffold(sbt.datasets(), sbt.storage());
+
+            new_sbt.save_file("test", None)?;
+
+            assert_eq!(new_sbt.datasets().len(), sbt.datasets().len());
+        }
+        Some("count_unique") => {
+            let cmd = m.subcommand_matches("count_unique").unwrap();
+
+            let index: &str = cmd.value_of("index").unwrap();
+
+            count_unique(index)?;
+        }
+        Some("search") => {
+            let cmd = m.subcommand_matches("search").unwrap();
+
+            if cmd.is_present("quiet") {
+                log::set_max_level(LevelFilter::Warn);
+            }
+
+            let query = load_query_signature(
+                cmd.value_of("query").unwrap(),
+                if cmd.is_present("ksize") {
+                    cmd.value_of("ksize").unwrap().parse().unwrap()
+                } else {
+                    // TODO default k
+                    unimplemented!()
+                },
+                Some("dna"), // TODO: select moltype,
+                if cmd.is_present("scaled") {
+                    Some(cmd.value_of("scaled").unwrap().parse().unwrap())
+                } else {
+                    None
+                },
+            )?;
+
+            info!(
+                "loaded query: {}... (k={}, {})",
+                query.name(),
+                query.ksize(),
+                query.moltype()
+            );
+
+            // TODO: check args.scaled and downsample
+
+            let containment = cmd.is_present("containment");
+            let traverse_directory = cmd.is_present("traverse-directory");
+            let databases = load_sbts_and_sigs(
+                &cmd.values_of("databases")
+                    .map(|vals| vals.collect::<Vec<_>>())
+                    .unwrap(),
+                &query,
+                containment,
+                traverse_directory,
+            )?;
+
+            if databases.is_empty() {
+                return Err(failure::err_msg("Nothing found to search!").into());
+            }
+
+            let best_only = cmd.is_present("best-only");
+            let threshold = cmd.value_of("threshold").unwrap().parse().unwrap();
+            let ignore_abundance = cmd.is_present("ignore-abundance");
+            let results = search_databases(
+                query,
+                &databases,
+                threshold,
+                containment,
+                best_only,
+                ignore_abundance,
+            )?;
+
+            let num_results = if best_only {
+                1
+            } else {
+                cmd.value_of("num-results").unwrap().parse().unwrap()
+            };
+
+            let n_matches = if num_results == 0 || results.len() <= num_results {
+                println!("{} matches:", results.len());
+                results.len()
+            } else {
+                println!("{} matches; showing first {}:", results.len(), num_results);
+                num_results
+            };
+
+            println!("similarity match");
+            println!("---------- -----");
+            for sr in &results[..n_matches] {
+                println!(
+                    "{:>5.1}% {:60}",
+                    sr.similarity * 100.,
+                    sr.match_sig.name()
+                );
+            }
+
+            if best_only {
+                info!("** reporting only one match because --best-only was set")
+            }
+
+            if let Some(output) = cmd.value_of("output") {
+                let mut wrt = csv::Writer::from_path(output)?;
+
+                for sr in &results[..n_matches] {
+                    wrt.serialize(sr)?;
+                }
+                wrt.flush()?;
+            };
+
+            if let Some(outname) = cmd.value_of("save-matches") {
+                let writer = get_output(outname, CompressionFormat::No)?;
+
+                info!("saving all matched signatures to \"{}\"", outname);
+
+                let sigs: Vec<Signature> = results.into_iter().map(|sr| sr.match_sig).collect();
+                serde_json::to_writer(writer, &sigs)?;
+            }
+        }
+        _ => {
+            println!("{:?}", m);
+        }
+    }
+    Ok(())
+}
diff --git a/src/sourmash.yml b/src/bin/sourmash.yml
similarity index 89%
rename from src/sourmash.yml
rename to src/bin/sourmash.yml
index 86facfd97..765358e2a 100644
--- a/src/sourmash.yml
+++ b/src/bin/sourmash.yml
@@ -7,6 +7,14 @@ settings:
     - SubcommandRequiredElseHelp
 
 subcommands:
+    - count_unique:
+        about: count unique k-mers in an Index
+        settings:
+          - ArgRequiredElseHelp
+        args:
+            - index:
+                help: index to search
+                required: true
     - scaffold:
         about: create SBT scaffold
         settings:
@@ -35,7 +43,7 @@
         - threshold:
             long: "threshold"
            help: minimum threshold for reporting matches
-            default_value: "0.8"
+            default_value: "0.08"
             takes_value: true
             required: false
         - save-matches:
@@ -93,6 +101,13 @@
             - databases:
                 help: "signatures/SBTs to search"
                 multiple: true
+    - prepare:
+        about: prepare an index
+        settings:
+          - ArgRequiredElseHelp
+        args:
+            - index:
+                help: SBT index
 
 # groups:
 #     - protein:
diff --git a/src/cmd.rs b/src/cmd.rs
new file mode 100644
index 000000000..79538ff82
--- /dev/null
+++ b/src/cmd.rs
@@ -0,0 +1,179 @@
+use std::path::{Path, PathBuf};
+use std::rc::Rc;
+
+use bio::io::fastx;
+use failure::Error;
+use log::info;
+use ocf::{get_input, get_output, CompressionFormat};
+use pdatastructs::hyperloglog::HyperLogLog;
+
+use crate::index::linear::LinearIndex;
+use crate::index::storage::{FSStorage, Storage};
+use crate::index::{Comparable, Dataset, Index, UKHSTree, MHBT};
+use crate::signature::{Signature, SigsTrait};
+use crate::sketch::ukhs::{FlatUKHS, UKHSTrait, UniqueUKHS};
+use crate::sketch::Sketch;
+
+pub fn draff_index(sig_files: Vec<&str>, outfile: &str) -> Result<(), Error> {
+    let storage: Rc<dyn Storage> = Rc::new(
+        FSStorage::new(".".into(), ".draff".into()), // TODO: use outfile
+    );
+
+    //let mut index = UKHSTree::builder().storage(Rc::clone(&storage)).build();
+    let mut index = LinearIndex::<Dataset<Signature>>::builder()
+        .storage(Rc::clone(&storage))
+        .build();
+
+    for filename in sig_files {
+        // TODO: check for stdin? can also use get_input()?
+
+        let ukhs_sig = FlatUKHS::load(&filename)?;
+
+        let path = Path::new(filename);
+        let basename = path.file_name().unwrap().to_str().unwrap().to_owned();
+
+        let sig = Signature::builder()
+            .hash_function("nthash") // TODO: spec!
+            .class("draff_signature") // TODO: spec!
+            .name(Some(basename.to_owned()))
+            .filename(Some(basename.to_owned()))
+            .signatures(vec![Sketch::UKHS(ukhs_sig)])
+            .build();
+
+        let dataset = sig.into();
+
+        index.insert(&dataset)?;
+    }
+
+    // TODO: implement to_writer and use this?
+    //let mut output = get_output(outfile, CompressionFormat::No)?;
+    //index.to_writer(&mut output)?
+
+    index.save_file(outfile, None)
+}
+
+pub fn draff_compare(sigs: Vec<&str>) -> Result<(), Error> {
+    let mut dists = vec![vec![0.; sigs.len()]; sigs.len()];
+    let loaded_sigs: Vec<FlatUKHS> = sigs.iter().map(|s| FlatUKHS::load(s).unwrap()).collect();
+
+    for (i, sig1) in loaded_sigs.iter().enumerate() {
+        for (j, sig2) in loaded_sigs.iter().enumerate() {
+            if i >= j {
+                dists[i][j] = 1. - sig1.distance(sig2);
+                dists[j][i] = dists[i][j];
+            }
+        }
+    }
+
+    for row in dists {
+        println!("{:.2?}", row);
+    }
+
+    Ok(())
+}
+
+pub fn draff_search(index: &str, query: &str) -> Result<(), Error> {
+    let index = UKHSTree::from_path(index)?;
+
+    let ukhs_sig = FlatUKHS::load(&query)?;
+
+    let sig = Signature::builder()
+        .hash_function("nthash") // TODO: spec!
+        .class("draff_signature") // TODO: spec!
+        .name(Some(query.to_owned()))
+        .filename(Some(query.to_owned()))
+        .signatures(vec![Sketch::UKHS(ukhs_sig)])
+        .build();
+
+    let dataset = sig.into();
+
+    for found in index.search(&dataset, 0.9, false)? {
+        println!("{:.2}: {:?}", dataset.similarity(found), found);
+    }
+
+    Ok(())
+}
+
+pub fn prepare(index_path: &str) -> Result<(), Error> {
+    let mut index = MHBT::from_path(index_path)?;
+
+    // TODO equivalent to fill_internal in python
+    //unimplemented!();
+
+    index.save_file(index_path, None)?;
+
+    Ok(())
+}
+
+pub fn draff_signature(files: Vec<&str>, k: usize, w: usize) -> Result<(), Error> {
+    for filename in files {
+        // TODO: check for stdin?
+
+        let mut ukhs = UniqueUKHS::new(k, w)?;
+
+        info!("Build signature for {} with W={}, K={}...", filename, w, k);
+
+        let (input, _) = get_input(filename)?;
+        let reader = fastx::Reader::new(input);
+
+        for record in reader.records() {
+            let record = record?;
+
+            // if there is anything other than ACGT in sequence,
+            // it is replaced with A.
+            // This matches khmer and screed behavior
+            //
+            // NOTE: sourmash is different! It uses the force flag to drop
+            // k-mers that are not ACGT
+            let seq: Vec<u8> = record
+                .seq()
+                .iter()
+                .map(|&x| match x as char {
+                    'A' | 'C' | 'G' | 'T' => x,
+                    'a' | 'c' | 'g' | 't' => x.to_ascii_uppercase(),
+                    _ => 'A' as u8,
+                })
+                .collect();
+
+            ukhs.add_sequence(&seq, false)?;
+        }
+
+        let mut outfile = PathBuf::from(filename);
+        outfile.set_extension("sig");
+
+        let mut output = get_output(outfile.to_str().unwrap(), CompressionFormat::No)?;
+
+        let flat: FlatUKHS = ukhs.into();
+        flat.to_writer(&mut output)?
+    }
+    info!("Done.");
+
+    Ok(())
+}
+
+pub fn count_unique(index_path: &str) -> Result<(), Error> {
+    let index = MHBT::from_path(index_path)?;
+
+    info!("Loaded index: {}", index_path);
+
+    let mut hll = HyperLogLog::new(16);
+
+    let mut total_hashes = 0u64;
+
+    for (n, dataset) in index.datasets().iter().enumerate() {
+        if n % 1000 == 0 {
+            info!("Processed {} datasets", n);
+            info!("Unique hashes in {}: {}", index_path, hll.count());
+            info!("Total hashes in {}: {}", index_path, total_hashes);
+        };
+        for hash in dataset.mins() {
+            hll.add(&hash);
+            total_hashes += 1;
+        }
+    }
+
+    info!("Unique k-mers in {}: {}", index_path, hll.count());
+    info!("Total hashes in {}: {}", index_path, total_hashes);
+
+    Ok(())
+}
diff --git a/src/errors.rs b/src/errors.rs
index 521b46566..9ecc68966 100644
--- a/src/errors.rs
+++ b/src/errors.rs
@@ -18,11 +18,17 @@ pub enum SourmashError {
     #[fail(display = "mismatch in seed; comparison fail")]
     MismatchSeed,
 
+    #[fail(display = "different signatures cannot be compared")]
+    MismatchSignatureType,
+
     #[fail(display = "invalid DNA character in input k-mer: {}", message)]
     InvalidDNA { message: String },
 
     #[fail(display = "invalid protein character in input: {}", message)]
     InvalidProt { message: String },
+
+    #[fail(display = "Error from deserialization")]
+    SerdeError,
 }
 
 #[repr(u32)]
@@ -39,6 +45,7 @@ pub enum SourmashErrorCode {
     MismatchDNAProt = 1_02,
     MismatchMaxHash = 1_03,
     MismatchSeed = 1_04,
+    MismatchSignatureType = 1_05,
     // Input sequence errors
     InvalidDNA = 11_01,
     InvalidProt = 11_02,
@@ -64,8 +71,12 @@ impl SourmashErrorCode {
             SourmashError::MismatchDNAProt => SourmashErrorCode::MismatchDNAProt,
             SourmashError::MismatchMaxHash => SourmashErrorCode::MismatchMaxHash,
             SourmashError::MismatchSeed => SourmashErrorCode::MismatchSeed,
+            SourmashError::MismatchSignatureType => {
+                SourmashErrorCode::MismatchSignatureType
+            }
            SourmashError::InvalidDNA { .. } => SourmashErrorCode::InvalidDNA,
            SourmashError::InvalidProt { ..
} => SourmashErrorCode::InvalidProt, + SourmashError::SerdeError => SourmashErrorCode::SerdeError, }; } } diff --git a/src/ffi.rs b/src/ffi.rs deleted file mode 100644 index b1b7c865b..000000000 --- a/src/ffi.rs +++ /dev/null @@ -1,603 +0,0 @@ -use std::ffi::CStr; -use std::io; -use std::mem; -use std::os::raw::c_char; -use std::ptr; -use std::slice; - -use ocf::get_input; -use serde_json; - -use crate::utils::SourmashStr; -use crate::{KmerMinHash, Signature, _hash_murmur}; - -#[no_mangle] -pub extern "C" fn hash_murmur(kmer: *const c_char, seed: u64) -> u64 { - let c_str = unsafe { - assert!(!kmer.is_null()); - - CStr::from_ptr(kmer) - }; - - _hash_murmur(c_str.to_bytes(), seed) -} - -#[no_mangle] -pub unsafe extern "C" fn kmerminhash_new( - n: u32, - k: u32, - prot: bool, - seed: u64, - mx: u64, - track_abundance: bool, -) -> *mut KmerMinHash { - mem::transmute(Box::new(KmerMinHash::new( - n, - k, - prot, - seed, - mx, - track_abundance, - ))) -} - -#[no_mangle] -pub extern "C" fn kmerminhash_free(ptr: *mut KmerMinHash) { - if ptr.is_null() { - return; - } - unsafe { - Box::from_raw(ptr); - } -} - -ffi_fn! { -unsafe fn kmerminhash_add_sequence(ptr: *mut KmerMinHash, sequence: *const c_char, force: bool) -> - Result<()> { - let mh = { - assert!(!ptr.is_null()); - &mut *ptr - }; - let c_str = { - assert!(!sequence.is_null()); - - CStr::from_ptr(sequence) - }; - - mh.add_sequence(c_str.to_bytes(), force) -} -} - -#[no_mangle] -pub extern "C" fn kmerminhash_add_hash(ptr: *mut KmerMinHash, h: u64) { - let mh = unsafe { - assert!(!ptr.is_null()); - &mut *ptr - }; - - mh.add_hash(h); -} - -#[no_mangle] -pub extern "C" fn kmerminhash_add_word(ptr: *mut KmerMinHash, word: *const c_char) { - let mh = unsafe { - assert!(!ptr.is_null()); - &mut *ptr - }; - let c_str = unsafe { - assert!(!word.is_null()); - - CStr::from_ptr(word) - }; - - mh.add_word(c_str.to_bytes()); -} - -ffi_fn! { -unsafe fn kmerminhash_get_mins(ptr: *mut KmerMinHash) -> Result<*const u64> { - let mh = { - assert!(!ptr.is_null()); - &mut *ptr - }; - let output = mh.mins.clone(); - - Ok(Box::into_raw(output.into_boxed_slice()) as *const u64) -} -} - -ffi_fn! { -unsafe fn kmerminhash_get_abunds(ptr: *mut KmerMinHash) -> Result<*const u64> { - let mh = { - assert!(!ptr.is_null()); - &mut *ptr - }; - if let Some(ref abunds) = mh.abunds { - let output = abunds.clone(); - Ok(Box::into_raw(output.into_boxed_slice()) as *const u64) - } else { - Ok(ptr::null()) - } -} -} - -ffi_fn! { -unsafe fn kmerminhash_get_min_idx(ptr: *mut KmerMinHash, idx: u64) -> Result { - let mh = { - assert!(!ptr.is_null()); - &mut *ptr - }; - Ok(mh.mins[idx as usize]) -} -} - -#[no_mangle] -pub extern "C" fn kmerminhash_get_mins_size(ptr: *mut KmerMinHash) -> usize { - let mh = unsafe { - assert!(!ptr.is_null()); - &mut *ptr - }; - mh.mins.len() -} - -#[no_mangle] -pub extern "C" fn kmerminhash_mins_push(ptr: *mut KmerMinHash, val: u64) { - let mh = unsafe { - assert!(!ptr.is_null()); - &mut *ptr - }; - mh.mins.push(val) -} - -ffi_fn! 
{ -unsafe fn kmerminhash_get_abund_idx(ptr: *mut KmerMinHash, idx: u64) -> Result { - let mh = { - assert!(!ptr.is_null()); - &mut *ptr - }; - if let Some(ref mut abunds) = mh.abunds { - Ok(abunds[idx as usize]) - } else { - Ok(0) - } -} -} - -#[no_mangle] -pub extern "C" fn kmerminhash_get_abunds_size(ptr: *mut KmerMinHash) -> usize { - let mh = unsafe { - assert!(!ptr.is_null()); - &mut *ptr - }; - if let Some(ref mut abunds) = mh.abunds { - abunds.len() - } else { - 0 - } -} - -#[no_mangle] -pub extern "C" fn kmerminhash_abunds_push(ptr: *mut KmerMinHash, val: u64) { - let mh = unsafe { - assert!(!ptr.is_null()); - &mut *ptr - }; - if let Some(ref mut abunds) = mh.abunds { - abunds.push(val) - } -} - -#[no_mangle] -pub extern "C" fn kmerminhash_is_protein(ptr: *mut KmerMinHash) -> bool { - let mh = unsafe { - assert!(!ptr.is_null()); - &mut *ptr - }; - mh.is_protein -} - -#[no_mangle] -pub extern "C" fn kmerminhash_seed(ptr: *mut KmerMinHash) -> u64 { - let mh = unsafe { - assert!(!ptr.is_null()); - &mut *ptr - }; - mh.seed -} - -#[no_mangle] -pub extern "C" fn kmerminhash_track_abundance(ptr: *mut KmerMinHash) -> bool { - let mh = unsafe { - assert!(!ptr.is_null()); - &mut *ptr - }; - mh.abunds.is_some() -} - -#[no_mangle] -pub extern "C" fn kmerminhash_num(ptr: *mut KmerMinHash) -> u32 { - let mh = unsafe { - assert!(!ptr.is_null()); - &mut *ptr - }; - mh.num -} - -#[no_mangle] -pub extern "C" fn kmerminhash_ksize(ptr: *mut KmerMinHash) -> u32 { - let mh = unsafe { - assert!(!ptr.is_null()); - &mut *ptr - }; - mh.ksize -} - -#[no_mangle] -pub extern "C" fn kmerminhash_max_hash(ptr: *mut KmerMinHash) -> u64 { - let mh = unsafe { - assert!(!ptr.is_null()); - &mut *ptr - }; - mh.max_hash -} - -ffi_fn! { -unsafe fn kmerminhash_merge(ptr: *mut KmerMinHash, other: *const KmerMinHash) -> Result<()> { - let mh = { - assert!(!ptr.is_null()); - &mut *ptr - }; - let other_mh = { - assert!(!other.is_null()); - &*other - }; - - mh.merge(other_mh)?; - Ok(()) -} -} - -ffi_fn! { -unsafe fn kmerminhash_add_from(ptr: *mut KmerMinHash, other: *const KmerMinHash) - -> Result<()> { - let mh = { - assert!(!ptr.is_null()); - &mut *ptr - }; - let other_mh = { - assert!(!other.is_null()); - &*other - }; - - mh.add_from(other_mh) -} -} - -ffi_fn! { -unsafe fn kmerminhash_count_common(ptr: *mut KmerMinHash, other: *const KmerMinHash) - -> Result { - let mh = { - assert!(!ptr.is_null()); - &mut *ptr - }; - let other_mh = { - assert!(!other.is_null()); - &*other - }; - - mh.count_common(other_mh) -} -} - -ffi_fn! { -unsafe fn kmerminhash_intersection(ptr: *mut KmerMinHash, other: *const KmerMinHash) - -> Result { - let mh = { - assert!(!ptr.is_null()); - &mut *ptr - }; - let other_mh = { - assert!(!other.is_null()); - &*other - }; - - if let Ok((_, size)) = mh.intersection(other_mh) { - return Ok(size); - } - Ok(0) -} -} - -ffi_fn! { -unsafe fn kmerminhash_compare(ptr: *mut KmerMinHash, other: *const KmerMinHash) - -> Result { - let mh = { - assert!(!ptr.is_null()); - &mut *ptr - }; - let other_mh = { - assert!(!other.is_null()); - &*other - }; - - mh.compare(other_mh) -} -} - -// Signature methods - -#[no_mangle] -pub unsafe extern "C" fn signature_new() -> *mut Signature { - mem::transmute(Box::new(Signature::default())) -} - -#[no_mangle] -pub extern "C" fn signature_free(ptr: *mut Signature) { - if ptr.is_null() { - return; - } - unsafe { - Box::from_raw(ptr); - } -} - -ffi_fn! 
{ -unsafe fn signature_set_name(ptr: *mut Signature, name: *const c_char) -> - Result<()> { - let sig = { - assert!(!ptr.is_null()); - &mut *ptr - }; - let c_str = { - assert!(!name.is_null()); - - CStr::from_ptr(name) - }; - - if let Ok(name) = c_str.to_str() { - sig.name = Some(name.to_string()) - } - Ok(()) -} -} - -ffi_fn! { -unsafe fn signature_set_filename(ptr: *mut Signature, name: *const c_char) -> - Result<()> { - let sig = { - assert!(!ptr.is_null()); - &mut *ptr - }; - let c_str = { - assert!(!name.is_null()); - - CStr::from_ptr(name) - }; - - if let Ok(name) = c_str.to_str() { - sig.filename = Some(name.to_string()) - } - Ok(()) -} -} - -ffi_fn! { -unsafe fn signature_push_mh(ptr: *mut Signature, other: *const KmerMinHash) -> - Result<()> { - let sig = { - assert!(!ptr.is_null()); - &mut *ptr - }; - let mh = { - assert!(!other.is_null()); - &*other - }; - - sig.signatures.push(mh.clone()); - Ok(()) -} -} - -ffi_fn! { -unsafe fn signature_set_mh(ptr: *mut Signature, other: *const KmerMinHash) -> - Result<()> { - let sig = { - assert!(!ptr.is_null()); - &mut *ptr - }; - let mh = { - assert!(!other.is_null()); - &*other - }; - - sig.signatures = vec![mh.clone()]; - Ok(()) -} -} - -ffi_fn! { -unsafe fn signature_get_name(ptr: *mut Signature) -> Result { - let sig = { - assert!(!ptr.is_null()); - &mut *ptr - }; - - if let Some(ref name) = sig.name { - Ok(SourmashStr::from_string(name.to_string())) - } else { - Ok(SourmashStr::from_string("".to_string())) - } -} -} - -ffi_fn! { -unsafe fn signature_get_filename(ptr: *mut Signature) -> Result { - let sig = { - assert!(!ptr.is_null()); - &mut *ptr - }; - - if let Some(ref name) = sig.filename { - Ok(SourmashStr::from_string(name.to_string())) - } else { - Ok(SourmashStr::from_string("".to_string())) - } -} -} - -ffi_fn! { -unsafe fn signature_get_license(ptr: *mut Signature) -> Result { - let sig = { - assert!(!ptr.is_null()); - &mut *ptr - }; - - Ok(SourmashStr::from_string(sig.license.to_string())) -} -} - -ffi_fn! { -unsafe fn signature_first_mh(ptr: *mut Signature) -> Result<*mut KmerMinHash> { - let sig = { - assert!(!ptr.is_null()); - &mut *ptr - }; - - if let Some(mh) = sig.signatures.get(0) { - Ok(mem::transmute(Box::new(mh.clone()))) - } else { - // TODO: this is totally wrong - Ok(mem::transmute(Box::new(KmerMinHash::default()))) - } -} -} - -ffi_fn! { -unsafe fn signature_eq(ptr: *mut Signature, other: *mut Signature) -> Result { - let sig = { - assert!(!ptr.is_null()); - &mut *ptr - }; - - let other_sig = { - assert!(!other.is_null()); - &mut *other - }; - - Ok(sig == other_sig) -} -} - -ffi_fn! { -unsafe fn signature_save_json(ptr: *mut Signature) -> Result { - let sig = { - assert!(!ptr.is_null()); - &mut *ptr - }; - - let st = serde_json::to_string(sig)?; - Ok(SourmashStr::from_string(st)) -} -} - -ffi_fn! { -unsafe fn signature_get_mhs(ptr: *mut Signature, size: *mut usize) -> Result<*mut *mut KmerMinHash> { - let sig = { - assert!(!ptr.is_null()); - &mut *ptr - }; - - let output = sig.signatures.clone(); - - let ptr_sigs: Vec<*mut Signature> = output.into_iter().map(|x| { - Box::into_raw(Box::new(x)) as *mut Signature - }).collect(); - - let b = ptr_sigs.into_boxed_slice(); - *size = b.len(); - - Ok(Box::into_raw(b) as *mut *mut KmerMinHash) -} -} - -ffi_fn! 
{ -unsafe fn signatures_save_buffer(ptr: *mut *mut Signature, size: usize) -> Result { - let sigs = { - assert!(!ptr.is_null()); - slice::from_raw_parts(ptr, size) - }; - - let rsigs: Vec<&Signature> = sigs.iter().map(|x| x.as_ref().unwrap()).collect(); - let st = serde_json::to_string(&rsigs)?; - Ok(SourmashStr::from_string(st)) -} -} - -ffi_fn! { -unsafe fn signatures_load_path(ptr: *const c_char, - ignore_md5sum: bool, - ksize: usize, - select_moltype: *const c_char, - size: *mut usize) -> Result<*mut *mut Signature> { - let buf = { - assert!(!ptr.is_null()); - CStr::from_ptr(ptr) - }; - - let moltype = { - if select_moltype.is_null() { - None - } else { - Some(CStr::from_ptr(select_moltype).to_str()?) - } - }; - - // TODO: implement ignore_md5sum - - let (mut input, _) = get_input(buf.to_str()?)?; - let filtered_sigs = Signature::load_signatures(&mut input, ksize, moltype, None)?; - - let ptr_sigs: Vec<*mut Signature> = filtered_sigs.into_iter().map(|x| { - Box::into_raw(Box::new(x)) as *mut Signature - }).collect(); - - let b = ptr_sigs.into_boxed_slice(); - *size = b.len(); - - Ok(Box::into_raw(b) as *mut *mut Signature) -} -} -ffi_fn! { -unsafe fn signatures_load_buffer(ptr: *const c_char, - insize: usize, - ignore_md5sum: bool, - ksize: usize, - select_moltype: *const c_char, - size: *mut usize) -> Result<*mut *mut Signature> { - let buf = { - assert!(!ptr.is_null()); - slice::from_raw_parts(ptr as *mut u8, insize) - }; - - let moltype = { - if select_moltype.is_null() { - None - } else { - Some(CStr::from_ptr(select_moltype).to_str()?) - } - }; - - // TODO: implement ignore_md5sum - - let mut reader = io::BufReader::new(buf); - let filtered_sigs = Signature::load_signatures(&mut reader, ksize, moltype, None)?; - - let ptr_sigs: Vec<*mut Signature> = filtered_sigs.into_iter().map(|x| { - Box::into_raw(Box::new(x)) as *mut Signature - }).collect(); - - let b = ptr_sigs.into_boxed_slice(); - *size = b.len(); - - Ok(Box::into_raw(b) as *mut *mut Signature) -} -} diff --git a/src/ffi/minhash.rs b/src/ffi/minhash.rs new file mode 100644 index 000000000..2f278fa65 --- /dev/null +++ b/src/ffi/minhash.rs @@ -0,0 +1,336 @@ +use std::ffi::CStr; +use std::mem; +use std::os::raw::c_char; +use std::ptr; +use std::slice; + +use crate::signature::SigsTrait; +use crate::sketch::minhash::KmerMinHash; + +#[no_mangle] +pub unsafe extern "C" fn kmerminhash_new( + n: u32, + k: u32, + prot: bool, + seed: u64, + mx: u64, + track_abundance: bool, +) -> *mut KmerMinHash { + mem::transmute(Box::new(KmerMinHash::new( + n, + k, + prot, + seed, + mx, + track_abundance, + ))) +} + +#[no_mangle] +pub unsafe extern "C" fn kmerminhash_free(ptr: *mut KmerMinHash) { + if ptr.is_null() { + return; + } + Box::from_raw(ptr); +} + +ffi_fn! 
{ +unsafe fn kmerminhash_add_sequence(ptr: *mut KmerMinHash, sequence: *const c_char, force: bool) -> + Result<()> { + let mh = { + assert!(!ptr.is_null()); + &mut *ptr + }; + let c_str = { + assert!(!sequence.is_null()); + + CStr::from_ptr(sequence) + }; + + mh.add_sequence(c_str.to_bytes(), force) +} +} + +#[no_mangle] +pub unsafe extern "C" fn kmerminhash_add_hash(ptr: *mut KmerMinHash, h: u64) { + let mh = { + assert!(!ptr.is_null()); + &mut *ptr + }; + + mh.add_hash(h); +} + +#[no_mangle] +pub unsafe extern "C" fn kmerminhash_add_word(ptr: *mut KmerMinHash, word: *const c_char) { + let mh = { + assert!(!ptr.is_null()); + &mut *ptr + }; + let c_str = { + assert!(!word.is_null()); + + CStr::from_ptr(word) + }; + + mh.add_word(c_str.to_bytes()); +} + +#[no_mangle] +pub extern "C" fn kmerminhash_remove_hash(ptr: *mut KmerMinHash, h: u64) { + let mh = unsafe { + assert!(!ptr.is_null()); + &mut *ptr + }; + + mh.remove_hash(h); +} + +#[no_mangle] +pub extern "C" fn kmerminhash_remove_many( + ptr: *mut KmerMinHash, + hashes_ptr: *const u64, + insize: usize, +) { + let mh = unsafe { + assert!(!ptr.is_null()); + &mut *ptr + }; + + let hashes = unsafe { + assert!(!hashes_ptr.is_null()); + slice::from_raw_parts(hashes_ptr as *mut u64, insize) + }; + + mh.remove_many(hashes).expect("Hash removal error"); +} + +ffi_fn! { +unsafe fn kmerminhash_get_mins(ptr: *mut KmerMinHash) -> Result<*const u64> { + let mh = { + assert!(!ptr.is_null()); + &mut *ptr + }; + let output = mh.mins.clone(); + + Ok(Box::into_raw(output.into_boxed_slice()) as *const u64) +} +} + +ffi_fn! { +unsafe fn kmerminhash_get_abunds(ptr: *mut KmerMinHash) -> Result<*const u64> { + let mh = { + assert!(!ptr.is_null()); + &mut *ptr + }; + if let Some(ref abunds) = mh.abunds { + let output = abunds.clone(); + Ok(Box::into_raw(output.into_boxed_slice()) as *const u64) + } else { + Ok(ptr::null()) + } +} +} + +ffi_fn! { +unsafe fn kmerminhash_get_min_idx(ptr: *mut KmerMinHash, idx: u64) -> Result { + let mh = { + assert!(!ptr.is_null()); + &mut *ptr + }; + Ok(mh.mins[idx as usize]) +} +} + +#[no_mangle] +pub unsafe extern "C" fn kmerminhash_get_mins_size(ptr: *mut KmerMinHash) -> usize { + let mh = { + assert!(!ptr.is_null()); + &mut *ptr + }; + mh.mins.len() +} + +#[no_mangle] +pub unsafe extern "C" fn kmerminhash_mins_push(ptr: *mut KmerMinHash, val: u64) { + let mh = { + assert!(!ptr.is_null()); + &mut *ptr + }; + mh.mins.push(val) +} + +ffi_fn! 
{ +unsafe fn kmerminhash_get_abund_idx(ptr: *mut KmerMinHash, idx: u64) -> Result { + let mh = { + assert!(!ptr.is_null()); + &mut *ptr + }; + if let Some(ref mut abunds) = mh.abunds { + Ok(abunds[idx as usize]) + } else { + Ok(0) + } +} +} + +#[no_mangle] +pub unsafe extern "C" fn kmerminhash_get_abunds_size(ptr: *mut KmerMinHash) -> usize { + let mh = { + assert!(!ptr.is_null()); + &mut *ptr + }; + if let Some(ref mut abunds) = mh.abunds { + abunds.len() + } else { + 0 + } +} + +#[no_mangle] +pub unsafe extern "C" fn kmerminhash_abunds_push(ptr: *mut KmerMinHash, val: u64) { + let mh = { + assert!(!ptr.is_null()); + &mut *ptr + }; + if let Some(ref mut abunds) = mh.abunds { + abunds.push(val) + } +} + +#[no_mangle] +pub unsafe extern "C" fn kmerminhash_is_protein(ptr: *mut KmerMinHash) -> bool { + let mh = { + assert!(!ptr.is_null()); + &mut *ptr + }; + mh.is_protein() +} + +#[no_mangle] +pub unsafe extern "C" fn kmerminhash_seed(ptr: *mut KmerMinHash) -> u64 { + let mh = { + assert!(!ptr.is_null()); + &mut *ptr + }; + mh.seed() +} + +#[no_mangle] +pub unsafe extern "C" fn kmerminhash_track_abundance(ptr: *mut KmerMinHash) -> bool { + let mh = { + assert!(!ptr.is_null()); + &mut *ptr + }; + mh.abunds.is_some() +} + +#[no_mangle] +pub unsafe extern "C" fn kmerminhash_num(ptr: *mut KmerMinHash) -> u32 { + let mh = { + assert!(!ptr.is_null()); + &mut *ptr + }; + mh.num() +} + +#[no_mangle] +pub unsafe extern "C" fn kmerminhash_ksize(ptr: *mut KmerMinHash) -> u32 { + let mh = { + assert!(!ptr.is_null()); + &mut *ptr + }; + mh.ksize() as u32 +} + +#[no_mangle] +pub unsafe extern "C" fn kmerminhash_max_hash(ptr: *mut KmerMinHash) -> u64 { + let mh = { + assert!(!ptr.is_null()); + &mut *ptr + }; + mh.max_hash() +} + +ffi_fn! { +unsafe fn kmerminhash_merge(ptr: *mut KmerMinHash, other: *const KmerMinHash) -> Result<()> { + let mh = { + assert!(!ptr.is_null()); + &mut *ptr + }; + let other_mh = { + assert!(!other.is_null()); + &*other + }; + + mh.merge(other_mh)?; + Ok(()) +} +} + +ffi_fn! { +unsafe fn kmerminhash_add_from(ptr: *mut KmerMinHash, other: *const KmerMinHash) + -> Result<()> { + let mh = { + assert!(!ptr.is_null()); + &mut *ptr + }; + let other_mh = { + assert!(!other.is_null()); + &*other + }; + + mh.add_from(other_mh) +} +} + +ffi_fn! { +unsafe fn kmerminhash_count_common(ptr: *mut KmerMinHash, other: *const KmerMinHash) + -> Result { + let mh = { + assert!(!ptr.is_null()); + &mut *ptr + }; + let other_mh = { + assert!(!other.is_null()); + &*other + }; + + mh.count_common(other_mh) +} +} + +ffi_fn! { +unsafe fn kmerminhash_intersection(ptr: *mut KmerMinHash, other: *const KmerMinHash) + -> Result { + let mh = { + assert!(!ptr.is_null()); + &mut *ptr + }; + let other_mh = { + assert!(!other.is_null()); + &*other + }; + + if let Ok((_, size)) = mh.intersection(other_mh) { + return Ok(size); + } + Ok(0) +} +} + +ffi_fn! { +unsafe fn kmerminhash_compare(ptr: *mut KmerMinHash, other: *const KmerMinHash) + -> Result { + let mh = { + assert!(!ptr.is_null()); + &mut *ptr + }; + let other_mh = { + assert!(!other.is_null()); + &*other + }; + + mh.compare(other_mh) +} +} diff --git a/src/ffi/mod.rs b/src/ffi/mod.rs new file mode 100644 index 000000000..9c1d5a982 --- /dev/null +++ b/src/ffi/mod.rs @@ -0,0 +1,22 @@ +//! # Foreign Function Interface for calling sourmash from a C API +//! +//! Primary client for now is the Python version, using CFFI and milksnake. 
+ +pub mod minhash; +pub mod signature; + +use std::ffi::CStr; +use std::os::raw::c_char; + +use crate::_hash_murmur; + +#[no_mangle] +pub unsafe extern "C" fn hash_murmur(kmer: *const c_char, seed: u64) -> u64 { + let c_str = { + assert!(!kmer.is_null()); + + CStr::from_ptr(kmer) + }; + + _hash_murmur(c_str.to_bytes(), seed) +} diff --git a/src/ffi/signature.rs b/src/ffi/signature.rs new file mode 100644 index 000000000..ce69a1ddb --- /dev/null +++ b/src/ffi/signature.rs @@ -0,0 +1,289 @@ +use std::ffi::CStr; +use std::io; +use std::mem; +use std::os::raw::c_char; +use std::slice; + +use ocf::get_input; +use serde_json; + +use crate::signature::Signature; +use crate::sketch::minhash::KmerMinHash; +use crate::sketch::Sketch; +use crate::utils::SourmashStr; + +// Signature methods + +#[no_mangle] +pub unsafe extern "C" fn signature_new() -> *mut Signature { + mem::transmute(Box::new(Signature::default())) +} + +#[no_mangle] +pub unsafe extern "C" fn signature_free(ptr: *mut Signature) { + if ptr.is_null() { + return; + } + Box::from_raw(ptr); +} + +ffi_fn! { +unsafe fn signature_set_name(ptr: *mut Signature, name: *const c_char) -> + Result<()> { + let sig = { + assert!(!ptr.is_null()); + &mut *ptr + }; + let c_str = { + assert!(!name.is_null()); + + CStr::from_ptr(name) + }; + + if let Ok(name) = c_str.to_str() { + sig.name = Some(name.to_string()) + } + Ok(()) +} +} + +ffi_fn! { +unsafe fn signature_set_filename(ptr: *mut Signature, name: *const c_char) -> + Result<()> { + let sig = { + assert!(!ptr.is_null()); + &mut *ptr + }; + let c_str = { + assert!(!name.is_null()); + + CStr::from_ptr(name) + }; + + if let Ok(name) = c_str.to_str() { + sig.filename = Some(name.to_string()) + } + Ok(()) +} +} + +ffi_fn! { +unsafe fn signature_push_mh(ptr: *mut Signature, other: *const KmerMinHash) -> + Result<()> { + let sig = { + assert!(!ptr.is_null()); + &mut *ptr + }; + let mh = { + assert!(!other.is_null()); + &*other + }; + + sig.signatures.push(Sketch::MinHash(mh.clone())); + Ok(()) +} +} + +ffi_fn! { +unsafe fn signature_set_mh(ptr: *mut Signature, other: *const KmerMinHash) -> + Result<()> { + let sig = { + assert!(!ptr.is_null()); + &mut *ptr + }; + let mh = { + assert!(!other.is_null()); + &*other + }; + + sig.signatures = vec![Sketch::MinHash(mh.clone())]; + Ok(()) +} +} + +ffi_fn! { +unsafe fn signature_get_name(ptr: *mut Signature) -> Result { + let sig = { + assert!(!ptr.is_null()); + &mut *ptr + }; + + if let Some(ref name) = sig.name { + Ok(SourmashStr::from_string(name.to_string())) + } else { + Ok(SourmashStr::from_string("".to_string())) + } +} +} + +ffi_fn! { +unsafe fn signature_get_filename(ptr: *mut Signature) -> Result { + let sig = { + assert!(!ptr.is_null()); + &mut *ptr + }; + + if let Some(ref name) = sig.filename { + Ok(SourmashStr::from_string(name.to_string())) + } else { + Ok(SourmashStr::from_string("".to_string())) + } +} +} + +ffi_fn! { +unsafe fn signature_get_license(ptr: *mut Signature) -> Result { + let sig = { + assert!(!ptr.is_null()); + &mut *ptr + }; + + Ok(SourmashStr::from_string(sig.license.to_string())) +} +} + +ffi_fn! { +unsafe fn signature_first_mh(ptr: *mut Signature) -> Result<*mut KmerMinHash> { + let sig = { + assert!(!ptr.is_null()); + &mut *ptr + }; + + if let Some(mh) = sig.signatures.get(0) { + Ok(mem::transmute(Box::new(mh.clone()))) + } else { + unimplemented!() + } +} +} + +ffi_fn! 
{ +unsafe fn signature_eq(ptr: *mut Signature, other: *mut Signature) -> Result { + let sig = { + assert!(!ptr.is_null()); + &mut *ptr + }; + + let other_sig = { + assert!(!other.is_null()); + &mut *other + }; + + Ok(sig == other_sig) +} +} + +ffi_fn! { +unsafe fn signature_save_json(ptr: *mut Signature) -> Result { + let sig = { + assert!(!ptr.is_null()); + &mut *ptr + }; + + let st = serde_json::to_string(sig)?; + Ok(SourmashStr::from_string(st)) +} +} + +ffi_fn! { +unsafe fn signature_get_mhs(ptr: *mut Signature, size: *mut usize) -> Result<*mut *mut KmerMinHash> { + let sig = { + assert!(!ptr.is_null()); + &mut *ptr + }; + + let output = sig.signatures.clone(); + + let ptr_sigs: Vec<*mut Signature> = output.into_iter().map(|x| { + Box::into_raw(Box::new(x)) as *mut Signature + }).collect(); + + let b = ptr_sigs.into_boxed_slice(); + *size = b.len(); + + Ok(Box::into_raw(b) as *mut *mut KmerMinHash) +} +} + +ffi_fn! { +unsafe fn signatures_save_buffer(ptr: *mut *mut Signature, size: usize) -> Result { + let sigs = { + assert!(!ptr.is_null()); + slice::from_raw_parts(ptr, size) + }; + + let rsigs: Vec<&Signature> = sigs.iter().map(|x| x.as_ref().unwrap()).collect(); + let st = serde_json::to_string(&rsigs)?; + Ok(SourmashStr::from_string(st)) +} +} + +ffi_fn! { +unsafe fn signatures_load_path(ptr: *const c_char, + _ignore_md5sum: bool, + ksize: usize, + select_moltype: *const c_char, + size: *mut usize) -> Result<*mut *mut Signature> { + let buf = { + assert!(!ptr.is_null()); + CStr::from_ptr(ptr) + }; + + let moltype = { + if select_moltype.is_null() { + None + } else { + Some(CStr::from_ptr(select_moltype).to_str()?) + } + }; + + // TODO: implement ignore_md5sum + + let (mut input, _) = get_input(buf.to_str()?)?; + let filtered_sigs = Signature::load_signatures(&mut input, ksize, moltype, None)?; + + let ptr_sigs: Vec<*mut Signature> = filtered_sigs.into_iter().map(|x| { + Box::into_raw(Box::new(x)) as *mut Signature + }).collect(); + + let b = ptr_sigs.into_boxed_slice(); + *size = b.len(); + + Ok(Box::into_raw(b) as *mut *mut Signature) +} +} +ffi_fn! { +unsafe fn signatures_load_buffer(ptr: *const c_char, + insize: usize, + _ignore_md5sum: bool, + ksize: usize, + select_moltype: *const c_char, + size: *mut usize) -> Result<*mut *mut Signature> { + let buf = { + assert!(!ptr.is_null()); + slice::from_raw_parts(ptr as *mut u8, insize) + }; + + let moltype = { + if select_moltype.is_null() { + None + } else { + Some(CStr::from_ptr(select_moltype).to_str()?) 
+ } + }; + + // TODO: implement ignore_md5sum + + let mut reader = io::BufReader::new(buf); + let filtered_sigs = Signature::load_signatures(&mut reader, ksize, moltype, None)?; + + let ptr_sigs: Vec<*mut Signature> = filtered_sigs.into_iter().map(|x| { + Box::into_raw(Box::new(x)) as *mut Signature + }).collect(); + + let b = ptr_sigs.into_boxed_slice(); + *size = b.len(); + + Ok(Box::into_raw(b) as *mut *mut Signature) +} +} diff --git a/src/from.rs b/src/from.rs index 142090747..e0846a11d 100644 --- a/src/from.rs +++ b/src/from.rs @@ -1,6 +1,6 @@ use finch::minhashes::MinHashKmers; -use KmerMinHash; +use crate::signatures::minhash::KmerMinHash; impl From for KmerMinHash { fn from(other: MinHashKmers) -> KmerMinHash { @@ -15,13 +15,96 @@ impl From for KmerMinHash { true, ); - let hash_with_abunds = values + let hash_with_abunds: Vec<(u64, u64)> = values .iter() .map(|x| (x.hash as u64, x.count as u64)) .collect(); - new_mh.add_many_with_abund(hash_with_abunds); + new_mh.add_many_with_abund(&hash_with_abunds); new_mh } } + +#[cfg(test)] +mod test { + use std::collections::HashMap; + use std::collections::HashSet; + use std::iter::FromIterator; + + use crate::signatures::minhash::KmerMinHash; + + use finch::minhashes::MinHashKmers; + use needletail::kmer::canonical; + + use super::*; + + #[test] + fn finch_behavior() { + let mut a = KmerMinHash::new(20, 10, false, 42, 0, true); + let mut b = MinHashKmers::new(20, 42); + + let seq = b"TGCCGCCCAGCACCGGGTGACTAGGTTGAGCCATGATTAACCTGCAATGA"; + + a.add_sequence(seq, false); + + for kmer in seq.windows(10) { + b.push(&canonical(kmer), 0); + } + + let b_hashes = b.into_vec(); + + let s1: HashSet<_> = HashSet::from_iter(a.mins.iter().map(|x| *x)); + let s2: HashSet<_> = HashSet::from_iter(b_hashes.iter().map(|x| x.hash as u64)); + let i1 = &s1 & &s2; + + assert!(i1.len() == a.mins.len()); + assert!(i1.len() == b_hashes.len()); + + if let Some(abunds) = a.abunds { + let smap: HashMap<_, _> = HashMap::from_iter(a.mins.iter().zip(abunds.iter())); + println!("{:?}", smap); + for item in b_hashes.iter() { + assert!(smap.contains_key(&(item.hash as u64))); + assert!( + **smap.get(&(item.hash as u64)).unwrap() + == ((item.count + item.extra_count) as u64) + ); + } + } + } + + #[test] + fn from_finch() { + let mut a = KmerMinHash::new(20, 10, false, 42, 0, true); + let mut b = MinHashKmers::new(20, 42); + + let seq = b"TGCCGCCCAGCACCGGGTGACTAGGTTGAGCCATGATTAACCTGCAATGA"; + + a.add_sequence(seq, false); + + for kmer in seq.windows(10) { + b.push(&canonical(kmer), 0); + } + + let c = KmerMinHash::from(b); + + let s1: HashSet<_> = HashSet::from_iter(a.mins.iter().map(|x| *x)); + let s2: HashSet<_> = HashSet::from_iter(c.mins.iter().map(|x| *x)); + let i1 = &s1 & &s2; + + assert!(i1.len() == a.mins.len()); + assert!(i1.len() == c.mins.len()); + + if let Some(a_abunds) = a.abunds { + if let Some(c_abunds) = c.abunds { + let a_smap: HashMap<_, _> = HashMap::from_iter(a.mins.iter().zip(a_abunds.iter())); + let c_smap: HashMap<_, _> = HashMap::from_iter(c.mins.iter().zip(c_abunds.iter())); + for item in a_smap.iter() { + assert!(c_smap.contains_key(*item.0)); + assert!(c_smap.get(*item.0).unwrap() == item.1); + } + } + } + } +} diff --git a/src/index.rs b/src/index.rs deleted file mode 100644 index de5a9e3ee..000000000 --- a/src/index.rs +++ /dev/null @@ -1,161 +0,0 @@ -pub mod sbt; - -pub mod storage; - -pub mod nodegraph; - -pub mod linear; - -pub mod search; - -use std::path::Path; -use std::rc::Rc; - -use serde_derive::Deserialize; - -use 
derive_builder::Builder; -use failure::Error; -use lazy_init::Lazy; - -use crate::index::storage::{ReadData, Storage}; -use crate::Signature; - -pub trait Index { - type Item; - - fn find( - &self, - search_fn: F, - sig: &Self::Item, - threshold: f64, - ) -> Result, Error> - where - F: Fn(&dyn Comparable, &Self::Item, f64) -> bool; - - fn insert(&mut self, node: &Self::Item); - - fn save>(&self, path: P) -> Result<(), Error>; - - fn load>(path: P) -> Result<(), Error>; -} - -// TODO: split into two traits, Similarity and Containment? -pub trait Comparable { - fn similarity(&self, other: &O) -> f64; - fn containment(&self, other: &O) -> f64; -} - -impl<'a, N, L> Comparable for &'a N -where - N: Comparable, -{ - fn similarity(&self, other: &L) -> f64 { - (*self).similarity(&other) - } - - fn containment(&self, other: &L) -> f64 { - (*self).containment(&other) - } -} - -#[derive(Deserialize)] -pub struct LeafInfo { - pub filename: String, - pub name: String, - pub metadata: String, -} - -#[derive(Builder, Default, Clone)] -pub struct Leaf -where - T: std::marker::Sync, -{ - pub(crate) filename: String, - pub(crate) name: String, - pub(crate) metadata: String, - - pub(crate) storage: Option>, - - pub(crate) data: Rc>, -} - -impl std::fmt::Debug for Leaf -where - T: std::marker::Sync, -{ - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!( - f, - "Leaf [filename: {}, name: {}, metadata: {}]", - self.filename, self.name, self.metadata - ) - } -} - -impl ReadData for Leaf { - fn data(&self, storage: &S) -> Result<&Signature, Error> { - let sig = self.data.get_or_create(|| { - let raw = storage.load(&self.filename).unwrap(); - let sigs: Vec = serde_json::from_reader(&mut &raw[..]).unwrap(); - // TODO: select the right sig? - sigs[0].to_owned() - }); - - Ok(sig) - } -} - -impl Leaf { - pub fn count_common(&self, other: &Leaf) -> u64 { - if let Some(storage) = &self.storage { - let ng: &Signature = self.data(&**storage).unwrap(); - let ong: &Signature = other.data(&**storage).unwrap(); - - // TODO: select the right signatures... - ng.signatures[0].count_common(&ong.signatures[0]).unwrap() as u64 - } else { - 0 - } - } - - pub fn mins(&self) -> Vec { - if let Some(storage) = &self.storage { - let ng: &Signature = self.data(&**storage).unwrap(); - ng.signatures[0].mins.to_vec() - } else { - Vec::new() - } - } -} - -impl Comparable> for Leaf { - fn similarity(&self, other: &Leaf) -> f64 { - if let Some(storage) = &self.storage { - let ng: &Signature = self.data(&**storage).unwrap(); - let ong: &Signature = other.data(&**storage).unwrap(); - - // TODO: select the right signatures... - ng.signatures[0].compare(&ong.signatures[0]).unwrap() - } else { - // TODO: in this case storage is not set up, - // so we should throw an error? - 0.0 - } - } - - fn containment(&self, other: &Leaf) -> f64 { - if let Some(storage) = &self.storage { - let ng: &Signature = self.data(&**storage).unwrap(); - let ong: &Signature = other.data(&**storage).unwrap(); - - // TODO: select the right signatures... - let common = ng.signatures[0].count_common(&ong.signatures[0]).unwrap(); - let size = ng.signatures[0].mins.len(); - common as f64 / size as f64 - } else { - // TODO: in this case storage is not set up, - // so we should throw an error? 
- 0.0 - } - } -} diff --git a/src/index/bigsi.rs b/src/index/bigsi.rs new file mode 100644 index 000000000..07f67304d --- /dev/null +++ b/src/index/bigsi.rs @@ -0,0 +1,228 @@ +use std::collections::HashMap; +use std::path::Path; + +use failure::{Error, Fail}; +use fixedbitset::FixedBitSet; +use typed_builder::TypedBuilder; + +use crate::index::{Comparable, Index}; +use crate::signature::{Signature, SigsTrait}; +use crate::sketch::nodegraph::Nodegraph; +use crate::sketch::Sketch; +use crate::HashIntoType; + +#[derive(Clone, TypedBuilder)] +pub struct BIGSI { + matrix: Vec, + ksize: usize, + datasets: Vec, + //#[builder(setter(skip))] + //storage: Rc, +} + +#[derive(Debug, Fail)] +pub enum BIGSIError { + #[fail(display = "BIGSI doesn't support this method")] + MethodDisabled, +} + +impl BIGSI { + pub fn new(bf_size: usize, ksize: usize) -> BIGSI { + let mut matrix = Vec::with_capacity(bf_size); + for _ in 0..bf_size { + // TODO: figure initial capacity for each row + matrix.push(FixedBitSet::with_capacity(100)); + } + + BIGSI { + matrix, + ksize, + datasets: Vec::new(), + } + } +} + +impl BIGSI { + pub fn add(&mut self, dataset: Signature) { + let mut ng = Nodegraph::new(&[self.matrix.len()], self.ksize); + + // TODO: select correct minhash + if let Sketch::MinHash(mh) = &dataset.signatures[0] { + for h in &mh.mins { + ng.count(*h); + } + } else { + // TODO: what if it is not a mh? + unimplemented!() + } + + self.datasets.push(dataset); + let col = self.datasets.len() - 1; + + for pos in ng.bs[0].ones() { + let bs = &mut self.matrix[pos]; + if bs.len() == col { + bs.grow(col + col / 2); + } + bs.insert(col); + } + } + + pub fn query(&self, hash: HashIntoType) -> impl Iterator + '_ { + let pos = hash as usize % self.matrix.len(); + let bs = &self.matrix[pos]; + bs.ones() + } + + pub fn query_datasets(&self, hash: HashIntoType) -> impl Iterator + '_ { + self.query(hash).map(move |pos| self.datasets[pos].clone()) + } +} + +impl Index for BIGSI { + type Item = Signature; + + fn find( + &self, + _search_fn: F, + _sig: &Self::Item, + _threshold: f64, + ) -> Result, Error> + where + F: Fn(&dyn Comparable, &Self::Item, f64) -> bool, + { + // TODO: is there a better way than making this a runtime check? + //Err(BIGSIError::MethodDisabled.into()) + unimplemented!(); + } + + fn search( + &self, + sig: &Self::Item, + threshold: f64, + containment: bool, + ) -> Result, Error> { + let mut results = Vec::new(); + + //TODO: still assuming one mh in the signature! + if let Sketch::MinHash(hashes) = &sig.signatures[0] { + let mut counter: HashMap = HashMap::with_capacity(hashes.size()); + + for hash in &hashes.mins { + self.query(*hash) + .map(|dataset_idx| { + let idx = counter.entry(dataset_idx).or_insert(0); + *idx += 1; + }) + .count(); + } + + for (idx, count) in counter { + let match_sig = &self.datasets[idx]; + //TODO: still assuming one mh in the signature! + let match_mh = match_sig.signatures[0].size(); + + let score = if containment { + count as f64 / hashes.size() as f64 + } else { + count as f64 / (hashes.size() + match_mh - count) as f64 + }; + + if score >= threshold { + results.push(match_sig) + } + } + + Ok(results) + } else { + // TODO: what if it is not a minhash? 
+ unimplemented!() + } + } + + fn insert(&mut self, node: &Self::Item) -> Result<(), Error> { + self.add(node.clone()); + Ok(()) + } + + fn save>(&self, _path: P) -> Result<(), Error> { + unimplemented!() + } + + fn load>(_path: P) -> Result<(), Error> { + unimplemented!() + } + + fn datasets(&self) -> Vec { + unimplemented!() + } +} + +#[cfg(test)] +mod test { + use std::fs::File; + use std::io::BufReader; + use std::path::PathBuf; + use std::rc::Rc; + + use lazy_init::Lazy; + + use super::BIGSI; + + use crate::index::storage::ReadData; + use crate::index::Dataset; + use crate::index::{Index, MHBT}; + use crate::signature::Signature; + + #[test] + fn bigsi_sbt_oracle() { + let mut filename = PathBuf::from(env!("CARGO_MANIFEST_DIR")); + filename.push("tests/test-data/v5.sbt.json"); + + let sbt = MHBT::from_path(filename).expect("Loading error"); + + let mut bigsi = BIGSI::new(10000, 10); + let datasets = sbt.datasets(); + + let mut filename = PathBuf::from(env!("CARGO_MANIFEST_DIR")); + filename.push("tests/test-data/.sbt.v3/60f7e23c24a8d94791cc7a8680c493f9"); + + let mut reader = BufReader::new(File::open(filename).unwrap()); + let sigs = Signature::load_signatures(&mut reader, 31, Some("DNA".into()), None).unwrap(); + let sig_data = sigs[0].clone(); + + let data = Lazy::new(); + data.get_or_create(|| sig_data); + + let leaf = Dataset::builder() + .data(Rc::new(data)) + .filename("") + .name("") + .metadata("") + .storage(None) + .build(); + + for l in &datasets { + let data = l.data().unwrap(); + bigsi.insert(data).expect("insertion error!"); + } + + let results_sbt = sbt.search(&leaf, 0.5, false).unwrap(); + assert_eq!(results_sbt.len(), 1); + + let data = (*leaf.data).get().unwrap(); + let results_bigsi = bigsi.search(&data, 0.5, false).unwrap(); + assert_eq!(results_bigsi.len(), 1); + + assert_eq!(results_sbt.len(), results_bigsi.len()); + + let results_sbt = sbt.search(&leaf, 0.1, false).unwrap(); + assert_eq!(results_sbt.len(), 2); + + let data = (*leaf.data).get().unwrap(); + let results_bigsi = bigsi.search(&data, 0.1, false).unwrap(); + assert_eq!(results_bigsi.len(), 2); + + assert_eq!(results_sbt.len(), results_bigsi.len()); + } +} diff --git a/src/index/linear.rs b/src/index/linear.rs index 29ecb6699..6def3bd82 100644 --- a/src/index/linear.rs +++ b/src/index/linear.rs @@ -1,19 +1,32 @@ +use std::fs::File; +use std::io::{BufReader, Read}; +use std::mem; use std::path::Path; +use std::path::PathBuf; use std::rc::Rc; -use derive_builder::Builder; use failure::Error; +use lazy_init::Lazy; +use serde_derive::{Deserialize, Serialize}; +use typed_builder::TypedBuilder; -use crate::index::storage::Storage; -use crate::index::{Comparable, Index}; +use crate::index::storage::{FSStorage, ReadData, Storage, StorageInfo, ToWriter}; +use crate::index::{Comparable, Dataset, DatasetInfo, Index}; -#[derive(Builder)] +#[derive(TypedBuilder)] pub struct LinearIndex { - //#[builder(setter(skip))] - storage: Rc, + #[builder(default)] + storage: Option>, - #[builder(setter(skip))] - pub(crate) leaves: Vec, + #[builder(default)] + pub(crate) datasets: Vec, +} + +#[derive(Serialize, Deserialize)] +struct LinearInfo { + version: u32, + storage: StorageInfo, + leaves: Vec, } impl Index for LinearIndex @@ -32,7 +45,7 @@ where F: Fn(&dyn Comparable, &Self::Item, f64) -> bool, { Ok(self - .leaves + .datasets .iter() .flat_map(|node| { if search_fn(node, sig, threshold) { @@ -44,15 +57,141 @@ where .collect()) } - fn insert(&mut self, node: &L) { - self.leaves.push(node.clone()); + fn insert(&mut self, 
node: &L) -> Result<(), Error> { + self.datasets.push(node.clone()); + Ok(()) } - fn save>(&self, path: P) -> Result<(), Error> { - Ok(()) + fn save>(&self, _path: P) -> Result<(), Error> { + /* + let file = File::create(path)?; + match serde_json::to_writer(file, &self) { + Ok(_) => Ok(()), + Err(_) => Err(SourmashError::SerdeError.into()), + } + */ + unimplemented!() + } + + fn load>(_path: P) -> Result<(), Error> { + unimplemented!() + } + + fn datasets(&self) -> Vec { + self.datasets.to_vec() } +} + +impl LinearIndex> +where + L: std::marker::Sync + ToWriter, + Dataset: ReadData, +{ + pub fn save_file>( + &mut self, + path: P, + storage: Option>, + ) -> Result<(), Error> { + let ref_path = path.as_ref(); + let mut basename = ref_path.file_name().unwrap().to_str().unwrap().to_owned(); + if basename.ends_with(".sbt.json") { + basename = basename.replace(".sbt.json", ""); + } + let location = ref_path.parent().unwrap(); + + let storage = match storage { + Some(s) => s, + None => { + let subdir = format!(".linear.{}", basename); + Rc::new(FSStorage::new(location.to_str().unwrap(), &subdir)) + } + }; + + let args = storage.args(); + let storage_info = StorageInfo { + backend: "FSStorage".into(), + args, + }; + + let info: LinearInfo = LinearInfo { + storage: storage_info, + version: 5, + leaves: self + .datasets + .iter_mut() + .map(|l| { + // Trigger data loading + let _: &L = (*l).data().unwrap(); + + // set storage to new one + mem::replace(&mut l.storage, Some(Rc::clone(&storage))); + + let filename = (*l).save(&l.filename).unwrap(); + let new_node = DatasetInfo { + filename: filename, + name: l.name.clone(), + metadata: l.metadata.clone(), + }; + new_node + }) + .collect(), + }; + + let file = File::create(path)?; + serde_json::to_writer(file, &info)?; - fn load>(path: P) -> Result<(), Error> { Ok(()) } + + pub fn from_path>(path: P) -> Result>, Error> { + let file = File::open(&path)?; + let mut reader = BufReader::new(file); + + // TODO: match with available Storage while we don't + // add a function to build a Storage from a StorageInfo + let mut basepath = PathBuf::new(); + basepath.push(path); + basepath.canonicalize()?; + + let linear = + LinearIndex::>::from_reader(&mut reader, &basepath.parent().unwrap())?; + Ok(linear) + } + + pub fn from_reader(rdr: &mut R, path: P) -> Result>, Error> + where + R: Read, + P: AsRef, + { + // TODO: check https://serde.rs/enum-representations.html for a + // solution for loading v4 and v5 + let linear: LinearInfo = serde_json::from_reader(rdr)?; + + // TODO: support other storages + let mut st: FSStorage = (&linear.storage.args).into(); + st.set_base(path.as_ref().to_str().unwrap()); + let storage: Rc = Rc::new(st); + + Ok(LinearIndex { + storage: Some(Rc::clone(&storage)), + datasets: linear + .leaves + .into_iter() + .map(|l| { + let new_node = Dataset { + filename: l.filename, + name: l.name, + metadata: l.metadata, + storage: Some(Rc::clone(&storage)), + data: Rc::new(Lazy::new()), + }; + new_node + }) + .collect(), + }) + } + + pub fn storage(&self) -> Option> { + self.storage.clone() + } } diff --git a/src/index/mod.rs b/src/index/mod.rs new file mode 100644 index 000000000..2e9f1fa4e --- /dev/null +++ b/src/index/mod.rs @@ -0,0 +1,245 @@ +//! # Indexing structures for fast similarity search +//! +//! An index organizes signatures to allow for fast similarity search. +//! Some indices also support containment searches. 
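[Editor's aside: an illustrative sketch, not part of the patch, of how the Index trait declared below is meant to be driven; it assumes the crate exposes these items as `sourmash::index::{Index, MHBT}` and `sourmash::index::search::search_minhashes`, and that the v5 test data ships with the repository, mirroring the load_sbt test later in this patch:]

    use failure::Error;
    use sourmash::index::search::search_minhashes;
    use sourmash::index::{Index, MHBT};

    fn main() -> Result<(), Error> {
        // Load a v5 SBT and query it with one of its own datasets.
        let sbt = MHBT::from_path("tests/test-data/v5.sbt.json")?;
        let datasets = sbt.datasets();
        let query = &datasets[0];

        // Index::search(query, threshold, false) boils down to this call:
        let close = sbt.find(search_minhashes, query, 0.5)?;
        let loose = sbt.find(search_minhashes, query, 0.1)?;

        // Lowering the threshold can only add matches.
        assert!(close.len() <= loose.len());
        Ok(())
    }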
+
+pub mod bigsi;
+pub mod linear;
+pub mod sbt;
+
+pub mod storage;
+
+pub mod search;
+
+use std::path::Path;
+use std::rc::Rc;
+
+use serde_derive::{Deserialize, Serialize};
+
+use failure::Error;
+use lazy_init::Lazy;
+use typed_builder::TypedBuilder;
+
+use crate::index::sbt::{Node, SBT};
+use crate::index::search::{search_minhashes, search_minhashes_containment};
+use crate::index::storage::{ReadData, ReadDataError, Storage};
+use crate::signature::Signature;
+use crate::sketch::nodegraph::Nodegraph;
+use crate::sketch::ukhs::{FlatUKHS, UKHSTrait};
+use crate::sketch::Sketch;
+
+pub type MHBT = SBT<Node<Nodegraph>, Dataset<Signature>>;
+pub type UKHSTree = SBT<Node<FlatUKHS>, Dataset<Signature>>;
+
+pub trait Index {
+    type Item;
+
+    fn find<F>(
+        &self,
+        search_fn: F,
+        sig: &Self::Item,
+        threshold: f64,
+    ) -> Result<Vec<&Self::Item>, Error>
+    where
+        F: Fn(&dyn Comparable<Self::Item>, &Self::Item, f64) -> bool;
+
+    fn search(
+        &self,
+        sig: &Self::Item,
+        threshold: f64,
+        containment: bool,
+    ) -> Result<Vec<&Self::Item>, Error> {
+        if containment {
+            self.find(search_minhashes_containment, sig, threshold)
+        } else {
+            self.find(search_minhashes, sig, threshold)
+        }
+    }
+
+    //fn gather(&self, sig: &Self::Item, threshold: f64) -> Result<Vec<&Self::Item>, Error>;
+
+    fn insert(&mut self, node: &Self::Item) -> Result<(), Error>;
+
+    fn save<P: AsRef<Path>>(&self, path: P) -> Result<(), Error>;
+
+    fn load<P: AsRef<Path>>(path: P) -> Result<(), Error>;
+
+    fn datasets(&self) -> Vec<Self::Item>;
+}
+
+// TODO: split into two traits, Similarity and Containment?
+pub trait Comparable<O> {
+    fn similarity(&self, other: &O) -> f64;
+    fn containment(&self, other: &O) -> f64;
+}
+
+impl<'a, N, L> Comparable<L> for &'a N
+where
+    N: Comparable<L>,
+{
+    fn similarity(&self, other: &L) -> f64 {
+        (*self).similarity(&other)
+    }
+
+    fn containment(&self, other: &L) -> f64 {
+        (*self).containment(&other)
+    }
+}
+
+#[derive(Serialize, Deserialize, Debug)]
+pub struct DatasetInfo {
+    pub filename: String,
+    pub name: String,
+    pub metadata: String,
+}
+
+#[derive(TypedBuilder, Default, Clone)]
+pub struct Dataset<T>
+where
+    T: std::marker::Sync,
+{
+    pub(crate) filename: String,
+    pub(crate) name: String,
+    pub(crate) metadata: String,
+
+    pub(crate) storage: Option<Rc<dyn Storage>>,
+
+    pub(crate) data: Rc<Lazy<T>>,
+}
+
+impl<T> Dataset<T>
+where
+    T: std::marker::Sync + Default,
+{
+    pub fn name(&self) -> String {
+        self.name.clone()
+    }
+}
+
+impl<T> std::fmt::Debug for Dataset<T>
+where
+    T: std::marker::Sync,
+{
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(
+            f,
+            "Dataset [filename: {}, name: {}, metadata: {}]",
+            self.filename, self.name, self.metadata
+        )
+    }
+}
+
+impl ReadData<Signature> for Dataset<Signature> {
+    fn data(&self) -> Result<&Signature, Error> {
+        if let Some(sig) = self.data.get() {
+            Ok(sig)
+        } else if let Some(storage) = &self.storage {
+            let sig = self.data.get_or_create(|| {
+                let raw = storage.load(&self.filename).unwrap();
+                let sigs: Result<Vec<Signature>, _> = serde_json::from_reader(&mut &raw[..]);
+                if let Ok(sigs) = sigs {
+                    // TODO: select the right sig?
+                    sigs[0].to_owned()
+                } else {
+                    let sig: Signature = serde_json::from_reader(&mut &raw[..]).unwrap();
+                    sig
+                }
+            });
+
+            Ok(sig)
+        } else {
+            Err(ReadDataError::LoadError.into())
+        }
+    }
+}
+
+impl Dataset<Signature> {
+    pub fn count_common(&self, other: &Dataset<Signature>) -> u64 {
+        let ng: &Signature = self.data().unwrap();
+        let ong: &Signature = other.data().unwrap();
+
+        // TODO: select the right signatures...
+        // TODO: better matching here, what if it is not a mh?
+ if let Sketch::MinHash(mh) = &ng.signatures[0] { + if let Sketch::MinHash(omh) = &ong.signatures[0] { + return mh.count_common(&omh).unwrap() as u64; + } + } + unimplemented!(); + } + + pub fn mins(&self) -> Vec { + let ng: &Signature = self.data().unwrap(); + + // TODO: select the right signatures... + // TODO: better matching here, what if it is not a mh? + if let Sketch::MinHash(mh) = &ng.signatures[0] { + mh.mins.to_vec() + } else { + unimplemented!() + } + } +} + +impl From> for Signature { + fn from(other: Dataset) -> Signature { + other.data.get().unwrap().to_owned() + } +} + +impl From for Dataset { + fn from(other: Signature) -> Dataset { + let name = other.name(); + let filename = other.filename(); + + let data = Lazy::new(); + data.get_or_create(|| other); + + Dataset::builder() + .name(name) + .filename(filename) + .data(data) + .metadata("") + .storage(None) + .build() + } +} + +impl Comparable> for Dataset { + fn similarity(&self, other: &Dataset) -> f64 { + let ng: &Signature = self.data().unwrap(); + let ong: &Signature = other.data().unwrap(); + + // TODO: select the right signatures... + // TODO: better matching here, what if it is not a mh? + if let Sketch::MinHash(mh) = &ng.signatures[0] { + if let Sketch::MinHash(omh) = &ong.signatures[0] { + return mh.compare(&omh).unwrap(); + } + } + + if let Sketch::UKHS(mh) = &ng.signatures[0] { + if let Sketch::UKHS(omh) = &ong.signatures[0] { + return 1. - mh.distance(&omh); + } + } + + unimplemented!() + } + + fn containment(&self, other: &Dataset) -> f64 { + let ng: &Signature = self.data().unwrap(); + let ong: &Signature = other.data().unwrap(); + + // TODO: select the right signatures... + // TODO: better matching here, what if it is not a mh? + if let Sketch::MinHash(mh) = &ng.signatures[0] { + if let Sketch::MinHash(omh) = &ong.signatures[0] { + let common = mh.count_common(&omh).unwrap(); + let size = mh.mins.len(); + return common as f64 / size as f64; + } + } + unimplemented!() + } +} diff --git a/src/index/sbt.rs b/src/index/sbt.rs deleted file mode 100644 index fa159c41b..000000000 --- a/src/index/sbt.rs +++ /dev/null @@ -1,602 +0,0 @@ -use std::collections::{HashMap, HashSet}; -use std::fs::File; -use std::hash::{BuildHasherDefault, Hasher}; -use std::io::{BufReader, Read}; -use std::iter::FromIterator; -use std::path::{Path, PathBuf}; -use std::rc::Rc; - -use derive_builder::Builder; -use failure::Error; -use lazy_init::Lazy; -use serde_derive::Deserialize; - -use crate::index::nodegraph::Nodegraph; -use crate::index::storage::{FSStorage, ReadData, Storage, StorageInfo}; -use crate::index::{Comparable, Index, Leaf, LeafInfo}; -use crate::Signature; - -pub type MHBT = SBT, Leaf>; - -#[derive(Builder)] -pub struct SBT { - #[builder(default = "2")] - d: u32, - - storage: Rc, - - #[builder(setter(skip))] - factory: Factory, - - nodes: HashMap, - - leaves: HashMap, -} - -impl SBT -where - L: std::clone::Clone, -{ - #[inline(always)] - fn parent(&self, pos: u64) -> Option { - if pos == 0 { - None - } else { - Some((pos - 1) / (u64::from(self.d))) - } - } - - #[inline(always)] - fn child(&self, parent: u64, pos: u64) -> u64 { - u64::from(self.d) * parent + pos + 1 - } - - #[inline(always)] - fn children(&self, pos: u64) -> Vec { - (0..u64::from(self.d)).map(|c| self.child(pos, c)).collect() - } - - pub fn leaves(&self) -> Vec { - self.leaves.values().cloned().collect() - } - - pub fn storage(&self) -> Rc { - Rc::clone(&self.storage) - } - - // combine -} - -impl SBT, Leaf> -where - T: std::marker::Sync, - U: 
std::marker::Sync, -{ - pub fn from_reader(rdr: &mut R, path: P) -> Result, Leaf>, Error> - where - R: Read, - P: AsRef, - { - let sbt: SBTInfo = serde_json::from_reader(rdr)?; - - // TODO: match with available Storage while we don't - // add a function to build a Storage from a StorageInfo - let mut basepath = PathBuf::new(); - basepath.push(path); - basepath.push(&sbt.storage.args["path"]); - - let storage: Rc = Rc::new(FSStorage { basepath }); - - Ok(SBT { - d: sbt.d, - factory: sbt.factory, - storage: Rc::clone(&storage), - nodes: sbt - .nodes - .into_iter() - .map(|(n, l)| { - let new_node = Node { - filename: l.filename, - name: l.name, - metadata: l.metadata, - storage: Some(Rc::clone(&storage)), - data: Rc::new(Lazy::new()), - }; - (n, new_node) - }) - .collect(), - leaves: sbt - .leaves - .into_iter() - .map(|(n, l)| { - let new_node = Leaf { - filename: l.filename, - name: l.name, - metadata: l.metadata, - storage: Some(Rc::clone(&storage)), - data: Rc::new(Lazy::new()), - }; - (n, new_node) - }) - .collect(), - }) - } - - pub fn from_path>(path: P) -> Result, Leaf>, Error> { - let file = File::open(&path)?; - let mut reader = BufReader::new(file); - - // TODO: match with available Storage while we don't - // add a function to build a Storage from a StorageInfo - let mut basepath = PathBuf::new(); - basepath.push(path); - basepath.canonicalize()?; - - let sbt = SBT::, Leaf>::from_reader(&mut reader, &basepath.parent().unwrap())?; - Ok(sbt) - } -} - -impl Index for SBT -where - N: Comparable + Comparable, - L: Comparable + std::clone::Clone, -{ - type Item = L; - - fn find(&self, search_fn: F, sig: &L, threshold: f64) -> Result, Error> - where - F: Fn(&dyn Comparable, &Self::Item, f64) -> bool, - { - let mut matches = Vec::new(); - let mut visited = HashSet::new(); - let mut queue = vec![0u64]; - - while !queue.is_empty() { - let pos = queue.pop().unwrap(); - if !visited.contains(&pos) { - visited.insert(pos); - - if let Some(node) = self.nodes.get(&pos) { - if search_fn(&node, sig, threshold) { - for c in self.children(pos) { - queue.push(c); - } - } - } else if let Some(leaf) = self.leaves.get(&pos) { - if search_fn(leaf, sig, threshold) { - matches.push(leaf); - } - } - } - } - - Ok(matches) - } - - fn insert(&mut self, node: &L) {} - - fn save>(&self, path: P) -> Result<(), Error> { - Ok(()) - } - - fn load>(path: P) -> Result<(), Error> { - Ok(()) - } -} - -#[derive(Builder, Clone, Default, Deserialize)] -pub struct Factory { - class: String, - args: Vec, -} - -#[derive(Builder, Default, Clone)] -pub struct Node -where - T: std::marker::Sync, -{ - filename: String, - name: String, - metadata: HashMap, - storage: Option>, - #[builder(setter(skip))] - pub(crate) data: Rc>, -} - -impl Comparable> for Node { - fn similarity(&self, other: &Node) -> f64 { - if let Some(storage) = &self.storage { - let ng: &Nodegraph = self.data(&**storage).unwrap(); - let ong: &Nodegraph = other.data(&**storage).unwrap(); - ng.similarity(&ong) - } else { - // TODO: in this case storage is not set up, - // so we should throw an error? - 0.0 - } - } - - fn containment(&self, other: &Node) -> f64 { - if let Some(storage) = &self.storage { - let ng: &Nodegraph = self.data(&**storage).unwrap(); - let ong: &Nodegraph = other.data(&**storage).unwrap(); - ng.containment(&ong) - } else { - // TODO: in this case storage is not set up, - // so we should throw an error? 
- 0.0 - } - } -} - -impl Comparable> for Node { - fn similarity(&self, other: &Leaf) -> f64 { - if let Some(storage) = &self.storage { - let ng: &Nodegraph = self.data(&**storage).unwrap(); - let oth: &Signature = other.data(&**storage).unwrap(); - - // TODO: select the right signatures... - let sig = &oth.signatures[0]; - if sig.size() == 0 { - return 0.0; - } - - let matches: usize = sig.mins.iter().map(|h| ng.get(*h)).sum(); - - let min_n_below = self.metadata["min_n_below"] as f64; - - // This overestimates the similarity, but better than truncating too - // soon and losing matches - matches as f64 / min_n_below - } else { - // TODO: throw error, storage not initialized - 0.0 - } - } - - fn containment(&self, other: &Leaf) -> f64 { - if let Some(storage) = &self.storage { - let ng: &Nodegraph = self.data(&**storage).unwrap(); - let oth: &Signature = other.data(&**storage).unwrap(); - - // TODO: select the right signatures... - let sig = &oth.signatures[0]; - if sig.size() == 0 { - return 0.0; - } - - let matches: usize = sig.mins.iter().map(|h| ng.get(*h)).sum(); - - matches as f64 / sig.size() as f64 - } else { - // TODO: throw error, storage not initialized - 0.0 - } - } -} - -impl ReadData for Node { - fn data(&self, storage: &S) -> Result<&Nodegraph, Error> { - Ok(self.data.get_or_create(|| { - let raw = storage.load(&self.filename).unwrap(); - Nodegraph::from_reader(&mut &raw[..]).unwrap() - })) - } -} - -#[derive(Deserialize)] -struct NodeInfo { - filename: String, - name: String, - metadata: HashMap, -} - -#[derive(Deserialize)] -struct SBTInfo { - d: u32, - version: u32, - storage: StorageInfo, - factory: Factory, - nodes: HashMap, - leaves: HashMap, -} - -// This comes from finch -pub struct NoHashHasher(u64); - -impl Default for NoHashHasher { - #[inline] - fn default() -> NoHashHasher { - NoHashHasher(0x0) - } -} - -impl Hasher for NoHashHasher { - #[inline] - fn write(&mut self, bytes: &[u8]) { - *self = NoHashHasher( - (u64::from(bytes[0]) << 24) - + (u64::from(bytes[1]) << 16) - + (u64::from(bytes[2]) << 8) - + u64::from(bytes[3]), - ); - } - fn finish(&self) -> u64 { - self.0 - } -} - -type HashIntersection = HashSet>; - -enum BinaryTree { - Empty, - Internal(Box>), - Leaf(Box>>), -} - -struct TreeNode { - element: T, - left: BinaryTree, - right: BinaryTree, -} - -pub fn scaffold(mut datasets: Vec>) -> SBT, Leaf> -where - N: std::marker::Sync + std::clone::Clone + std::default::Default, -{ - let mut leaves: HashMap> = HashMap::with_capacity(datasets.len()); - - let mut next_round = Vec::new(); - - // generate two bottom levels: - // - datasets - // - first level of internal nodes - eprintln!("Start processing leaves"); - while !datasets.is_empty() { - let next_leaf = datasets.pop().unwrap(); - - let (simleaf_tree, in_common) = if datasets.is_empty() { - ( - BinaryTree::Empty, - HashIntersection::from_iter(next_leaf.mins().into_iter()), - ) - } else { - let mut similar_leaf_pos = 0; - let mut current_max = 0; - for (pos, leaf) in datasets.iter().enumerate() { - let common = next_leaf.count_common(leaf); - if common > current_max { - current_max = common; - similar_leaf_pos = pos; - } - } - - let similar_leaf = datasets.remove(similar_leaf_pos); - - let in_common = HashIntersection::from_iter(next_leaf.mins().into_iter()) - .union(&HashIntersection::from_iter( - similar_leaf.mins().into_iter(), - )) - .cloned() - .collect(); - - let simleaf_tree = BinaryTree::Leaf(Box::new(TreeNode { - element: similar_leaf, - left: BinaryTree::Empty, - right: BinaryTree::Empty, - })); - 
(simleaf_tree, in_common) - }; - - let leaf_tree = BinaryTree::Leaf(Box::new(TreeNode { - element: next_leaf, - left: BinaryTree::Empty, - right: BinaryTree::Empty, - })); - - let tree = BinaryTree::Internal(Box::new(TreeNode { - element: in_common, - left: leaf_tree, - right: simleaf_tree, - })); - - next_round.push(tree); - - if next_round.len() % 100 == 0 { - eprintln!("Processed {} leaves", next_round.len() * 2); - } - } - eprintln!("Finished processing leaves"); - - // while we don't get to the root, generate intermediary levels - while next_round.len() != 1 { - next_round = BinaryTree::process_internal_level(next_round); - eprintln!("Finished processing round {}", next_round.len()); - } - - // Convert from binary tree to nodes/leaves - let root = next_round.pop().unwrap(); - let mut visited = HashSet::new(); - let mut queue = vec![(0u64, root)]; - - while !queue.is_empty() { - let (pos, cnode) = queue.pop().unwrap(); - if !visited.contains(&pos) { - visited.insert(pos); - - match cnode { - BinaryTree::Leaf(leaf) => { - leaves.insert(pos, leaf.element); - } - BinaryTree::Internal(mut node) => { - let left = std::mem::replace(&mut node.left, BinaryTree::Empty); - let right = std::mem::replace(&mut node.right, BinaryTree::Empty); - queue.push((2 * pos + 1, left)); - queue.push((2 * pos + 2, right)); - } - BinaryTree::Empty => (), - } - } - } - - // save the new tree - - let storage: Rc = Rc::new(FSStorage { - basepath: ".sbt".into(), - }); - - SBTBuilder::default() - .storage(storage) - .nodes(HashMap::default()) - .leaves(leaves) - .build() - .unwrap() -} - -impl BinaryTree { - fn process_internal_level(mut current_round: Vec) -> Vec { - let mut next_round = Vec::with_capacity(current_round.len() + 1); - - while !current_round.is_empty() { - let next_node = current_round.pop().unwrap(); - - let similar_node = if current_round.is_empty() { - BinaryTree::Empty - } else { - let mut similar_node_pos = 0; - let mut current_max = 0; - for (pos, cmpe) in current_round.iter().enumerate() { - let common = BinaryTree::intersection_size(&next_node, &cmpe); - if common > current_max { - current_max = common; - similar_node_pos = pos; - } - } - current_round.remove(similar_node_pos) - }; - - let tree = BinaryTree::new_tree(next_node, similar_node); - - next_round.push(tree); - } - next_round - } - - fn new_tree(mut left: BinaryTree, mut right: BinaryTree) -> BinaryTree { - let in_common = if let BinaryTree::Internal(ref mut el1) = left { - match right { - BinaryTree::Internal(ref mut el2) => { - let c1 = std::mem::replace(&mut el1.element, HashIntersection::default()); - let c2 = std::mem::replace(&mut el2.element, HashIntersection::default()); - c1.union(&c2).cloned().collect() - } - BinaryTree::Empty => { - std::mem::replace(&mut el1.element, HashIntersection::default()) - } - _ => panic!("Should not see a Leaf at this level"), - } - } else { - HashIntersection::default() - }; - - BinaryTree::Internal(Box::new(TreeNode { - element: in_common, - left, - right, - })) - } - - fn intersection_size(n1: &BinaryTree, n2: &BinaryTree) -> usize { - if let BinaryTree::Internal(ref el1) = n1 { - if let BinaryTree::Internal(ref el2) = n2 { - return el1.element.intersection(&el2.element).count(); - } - }; - 0 - } -} - -#[cfg(test)] -mod test { - use super::*; - use crate::index::linear::{LinearIndex, LinearIndexBuilder}; - use crate::index::search::{search_minhashes, search_minhashes_containment}; - - #[test] - fn load_sbt() { - let mut filename = PathBuf::from(env!("CARGO_MANIFEST_DIR")); - 
filename.push("tests/test-data/v5.sbt.json"); - - let sbt = MHBT::from_path(filename).expect("Loading error"); - - assert_eq!(sbt.d, 2); - //assert_eq!(sbt.storage.backend, "FSStorage"); - //assert_eq!(sbt.storage.args["path"], ".sbt.v5"); - assert_eq!(sbt.factory.class, "GraphFactory"); - assert_eq!(sbt.factory.args, [1, 100000, 4]); - - println!("sbt leaves {:?} {:?}", sbt.leaves.len(), sbt.leaves); - - let leaf = &sbt.leaves[&7]; - - let results = sbt.find(search_minhashes, &leaf, 0.5).unwrap(); - assert_eq!(results.len(), 1); - println!("results: {:?}", results); - println!("leaf: {:?}", leaf); - - let results = sbt.find(search_minhashes, &leaf, 0.1).unwrap(); - assert_eq!(results.len(), 2); - println!("results: {:?}", results); - println!("leaf: {:?}", leaf); - - let mut linear = LinearIndexBuilder::default() - .storage(Rc::clone(&sbt.storage) as Rc) - .build() - .unwrap(); - for l in &sbt.leaves { - linear.insert(l.1); - } - - println!( - "linear leaves {:?} {:?}", - linear.leaves.len(), - linear.leaves - ); - - let results = linear.find(search_minhashes, &leaf, 0.5).unwrap(); - assert_eq!(results.len(), 1); - println!("results: {:?}", results); - println!("leaf: {:?}", leaf); - - let results = linear.find(search_minhashes, &leaf, 0.1).unwrap(); - assert_eq!(results.len(), 2); - println!("results: {:?}", results); - println!("leaf: {:?}", leaf); - - let results = linear - .find(search_minhashes_containment, &leaf, 0.5) - .unwrap(); - assert_eq!(results.len(), 2); - println!("results: {:?}", results); - println!("leaf: {:?}", leaf); - - let results = linear - .find(search_minhashes_containment, &leaf, 0.1) - .unwrap(); - assert_eq!(results.len(), 4); - println!("results: {:?}", results); - println!("leaf: {:?}", leaf); - } - - #[test] - fn scaffold_sbt() { - let mut filename = PathBuf::from(env!("CARGO_MANIFEST_DIR")); - filename.push("tests/test-data/v5.sbt.json"); - - let sbt = MHBT::from_path(filename).expect("Loading error"); - - let new_sbt: MHBT = scaffold(sbt.leaves()); - assert_eq!(new_sbt.leaves().len(), 7); - } -} diff --git a/src/index/sbt/mhbt.rs b/src/index/sbt/mhbt.rs new file mode 100644 index 000000000..46f710e52 --- /dev/null +++ b/src/index/sbt/mhbt.rs @@ -0,0 +1,110 @@ +use std::io::Write; + +use failure::Error; + +use crate::index::sbt::{FromFactory, Node, Update, SBT}; +use crate::index::storage::{ReadData, ReadDataError, ToWriter}; +use crate::index::{Comparable, Dataset}; +use crate::signature::{Signature, SigsTrait}; +use crate::sketch::nodegraph::Nodegraph; +use crate::sketch::Sketch; + +impl ToWriter for Nodegraph { + fn to_writer(&self, writer: &mut W) -> Result<(), Error> + where + W: Write, + { + self.save_to_writer(writer) + } +} + +impl FromFactory> for SBT, L> { + fn factory(&self, _name: &str) -> Result, Error> { + unimplemented!() + } +} + +impl Update> for Node { + fn update(&self, _other: &mut Node) -> Result<(), Error> { + unimplemented!(); + } +} + +impl Update> for Dataset { + fn update(&self, _other: &mut Node) -> Result<(), Error> { + unimplemented!(); + } +} + +impl Comparable> for Node { + fn similarity(&self, other: &Node) -> f64 { + let ng: &Nodegraph = self.data().unwrap(); + let ong: &Nodegraph = other.data().unwrap(); + ng.similarity(&ong) + } + + fn containment(&self, other: &Node) -> f64 { + let ng: &Nodegraph = self.data().unwrap(); + let ong: &Nodegraph = other.data().unwrap(); + ng.containment(&ong) + } +} + +impl Comparable> for Node { + fn similarity(&self, other: &Dataset) -> f64 { + let ng: &Nodegraph = self.data().unwrap(); + 
let oth: &Signature = other.data().unwrap(); + + // TODO: select the right signatures... + if let Sketch::MinHash(sig) = &oth.signatures[0] { + if sig.size() == 0 { + return 0.0; + } + + let matches: usize = sig.mins.iter().map(|h| ng.get(*h)).sum(); + + let min_n_below = self.metadata["min_n_below"] as f64; + + // This overestimates the similarity, but better than truncating too + // soon and losing matches + matches as f64 / min_n_below + } else { + //TODO what if it is not a minhash? + unimplemented!() + } + } + + fn containment(&self, other: &Dataset) -> f64 { + let ng: &Nodegraph = self.data().unwrap(); + let oth: &Signature = other.data().unwrap(); + + // TODO: select the right signatures... + if let Sketch::MinHash(sig) = &oth.signatures[0] { + if sig.size() == 0 { + return 0.0; + } + + let matches: usize = sig.mins.iter().map(|h| ng.get(*h)).sum(); + + matches as f64 / sig.size() as f64 + } else { + //TODO what if it is not a minhash? + unimplemented!() + } + } +} + +impl ReadData for Node { + fn data(&self) -> Result<&Nodegraph, Error> { + if let Some(storage) = &self.storage { + Ok(self.data.get_or_create(|| { + let raw = storage.load(&self.filename).unwrap(); + Nodegraph::from_reader(&mut &raw[..]).unwrap() + })) + } else if let Some(data) = self.data.get() { + Ok(data) + } else { + Err(ReadDataError::LoadError.into()) + } + } +} diff --git a/src/index/sbt/mod.rs b/src/index/sbt/mod.rs new file mode 100644 index 000000000..89ded5819 --- /dev/null +++ b/src/index/sbt/mod.rs @@ -0,0 +1,924 @@ +pub mod mhbt; +pub mod ukhs; + +use std::collections::hash_map::Entry; +use std::collections::{HashMap, HashSet}; +use std::fmt::Debug; +use std::fs::File; +use std::hash::{BuildHasherDefault, Hasher}; +use std::io::{BufReader, Read}; +use std::iter::FromIterator; +use std::mem; +use std::path::{Path, PathBuf}; +use std::rc::Rc; + +use failure::Error; +use lazy_init::Lazy; +use serde_derive::{Deserialize, Serialize}; +use typed_builder::TypedBuilder; + +use crate::index::storage::{FSStorage, ReadData, Storage, StorageInfo, ToWriter}; +use crate::index::{Comparable, Dataset, DatasetInfo, Index}; +use crate::signature::Signature; + +pub trait Update { + fn update(&self, other: &mut O) -> Result<(), Error>; +} + +pub trait FromFactory { + fn factory(&self, name: &str) -> Result; +} + +#[derive(TypedBuilder)] +pub struct SBT { + #[builder(default = 2)] + d: u32, + + #[builder(default)] + storage: Option>, + + #[builder(default_code = r#"Factory::GraphFactory { args: (1, 100000.0, 4) }"#)] + factory: Factory, + + #[builder(default_code = "HashMap::default()")] + nodes: HashMap, + + #[builder(default_code = "HashMap::default()")] + leaves: HashMap, +} + +const fn parent(pos: u64, d: u64) -> u64 { + ((pos - 1) / d) as u64 +} + +const fn child(parent: u64, pos: u64, d: u64) -> u64 { + d * parent + pos + 1 +} + +impl SBT +where + L: std::clone::Clone + Default, + N: Default, +{ + #[inline(always)] + fn parent(&self, pos: u64) -> Option { + if pos == 0 { + None + } else { + Some(parent(pos, u64::from(self.d))) + } + } + + #[inline(always)] + fn child(&self, parent: u64, pos: u64) -> u64 { + child(parent, pos, u64::from(self.d)) + } + + #[inline(always)] + fn children(&self, pos: u64) -> Vec { + (0..u64::from(self.d)).map(|c| self.child(pos, c)).collect() + } + + pub fn storage(&self) -> Option> { + self.storage.clone() + } + + // combine +} + +impl SBT, Dataset> +where + T: std::marker::Sync + ToWriter, + U: std::marker::Sync + ToWriter, + Node: ReadData, + Dataset: ReadData, +{ + fn parse_v4(rdr: 
&mut R) -> Result + where + R: Read, + { + let sinfo: SBTInfoV4 = serde_json::from_reader(rdr)?; + Ok(SBTInfo::V4(sinfo)) + } + + fn parse_v5(rdr: &mut R) -> Result + where + R: Read, + { + let sinfo: SBTInfoV5 = serde_json::from_reader(rdr)?; + Ok(SBTInfo::V5(sinfo)) + } + + pub fn from_reader(rdr: &mut R, path: P) -> Result, Dataset>, Error> + where + R: Read, + P: AsRef, + { + // TODO: I would love to do this, but I get an untagged enum error with + // SBTInfo... + //let sinfo: SBTInfo = serde_json::from_reader(rdr)?; + + let mut s = String::new(); + rdr.read_to_string(&mut s)?; + + let sinfo = + Self::parse_v5(&mut s.as_bytes()).or_else(|_| Self::parse_v4(&mut s.as_bytes()))?; + + // TODO: support other storages + let mut st: FSStorage = match sinfo { + SBTInfo::V4(ref sbt) => (&sbt.storage.args).into(), + SBTInfo::V5(ref sbt) => (&sbt.storage.args).into(), + }; + st.set_base(path.as_ref().to_str().unwrap()); + let storage: Rc = Rc::new(st); + + let d = match sinfo { + SBTInfo::V4(ref sbt) => sbt.d, + SBTInfo::V5(ref sbt) => sbt.d, + }; + + let factory = match sinfo { + SBTInfo::V4(ref sbt) => sbt.factory.clone(), + SBTInfo::V5(ref sbt) => sbt.factory.clone(), + }; + + let (nodes, leaves) = match sinfo { + SBTInfo::V5(sbt) => { + let nodes = sbt + .nodes + .into_iter() + .map(|(n, l)| { + let new_node = Node { + filename: l.filename, + name: l.name, + metadata: l.metadata, + storage: Some(Rc::clone(&storage)), + data: Rc::new(Lazy::new()), + }; + (n, new_node) + }) + .collect(); + let leaves = sbt + .leaves + .into_iter() + .map(|(n, l)| { + let new_node = Dataset { + filename: l.filename, + name: l.name, + metadata: l.metadata, + storage: Some(Rc::clone(&storage)), + data: Rc::new(Lazy::new()), + }; + (n, new_node) + }) + .collect(); + (nodes, leaves) + } + SBTInfo::V4(sbt) => { + let nodes = sbt + .nodes + .iter() + .filter_map(|(n, x)| match x { + NodeInfoV4::Node(l) => { + let new_node = Node { + filename: l.filename.clone(), + name: l.name.clone(), + metadata: l.metadata.clone(), + storage: Some(Rc::clone(&storage)), + data: Rc::new(Lazy::new()), + }; + Some((*n, new_node)) + } + NodeInfoV4::Dataset(_) => None, + }) + .collect(); + + let leaves = sbt + .nodes + .into_iter() + .filter_map(|(n, x)| match x { + NodeInfoV4::Node(_) => None, + NodeInfoV4::Dataset(l) => { + let new_node = Dataset { + filename: l.filename, + name: l.name, + metadata: l.metadata, + storage: Some(Rc::clone(&storage)), + data: Rc::new(Lazy::new()), + }; + Some((n, new_node)) + } + }) + .collect(); + + (nodes, leaves) + } + }; + + Ok(SBT { + d, + factory, + storage: Some(Rc::clone(&storage)), + nodes, + leaves, + }) + } + + pub fn from_path>(path: P) -> Result, Dataset>, Error> { + let file = File::open(&path)?; + let mut reader = BufReader::new(file); + + // TODO: match with available Storage while we don't + // add a function to build a Storage from a StorageInfo + let mut basepath = PathBuf::new(); + basepath.push(path); + basepath.canonicalize()?; + + let sbt = + SBT::, Dataset>::from_reader(&mut reader, &basepath.parent().unwrap())?; + Ok(sbt) + } + + pub fn save_file>( + &mut self, + path: P, + storage: Option>, + ) -> Result<(), Error> { + let ref_path = path.as_ref(); + let mut basename = ref_path.file_name().unwrap().to_str().unwrap().to_owned(); + if basename.ends_with(".sbt.json") { + basename = basename.replace(".sbt.json", ""); + } + let location = ref_path.parent().unwrap(); + + let storage = match storage { + Some(s) => s, + None => { + let subdir = format!(".sbt.{}", basename); + 
Rc::new(FSStorage::new(location.to_str().unwrap(), &subdir)) + } + }; + + let args = storage.args(); + let storage_info = StorageInfo { + backend: "FSStorage".into(), + args, + }; + + let info: SBTInfoV5 = SBTInfoV5 { + d: self.d, + factory: self.factory.clone(), + storage: storage_info, + version: 5, + nodes: self + .nodes + .iter_mut() + .map(|(n, l)| { + // Trigger data loading + let _: &U = (*l).data().expect("Couldn't load data"); + + // set storage to new one + mem::replace(&mut l.storage, Some(Rc::clone(&storage))); + + let filename = (*l).save(&l.filename).unwrap(); + let new_node = NodeInfo { + filename: filename, + name: l.name.clone(), + metadata: l.metadata.clone(), + }; + (*n, new_node) + }) + .collect(), + leaves: self + .leaves + .iter_mut() + .map(|(n, l)| { + // Trigger data loading + let _: &T = (*l).data().unwrap(); + + // set storage to new one + mem::replace(&mut l.storage, Some(Rc::clone(&storage))); + + let filename = (*l).save(&l.filename).unwrap(); + let new_node = DatasetInfo { + filename: filename, + name: l.name.clone(), + metadata: l.metadata.clone(), + }; + (*n, new_node) + }) + .collect(), + }; + + let file = File::create(path)?; + serde_json::to_writer(file, &info)?; + + Ok(()) + } +} + +impl Index for SBT +where + N: Comparable + Comparable + Update + Debug + Default, + L: Comparable + Update + Clone + Debug + Default, + SBT: FromFactory, +{ + type Item = L; + + fn find(&self, search_fn: F, sig: &L, threshold: f64) -> Result, Error> + where + F: Fn(&dyn Comparable, &Self::Item, f64) -> bool, + { + let mut matches = Vec::new(); + let mut visited = HashSet::new(); + let mut queue = vec![0u64]; + + while !queue.is_empty() { + let pos = queue.pop().unwrap(); + if !visited.contains(&pos) { + visited.insert(pos); + + if let Some(node) = self.nodes.get(&pos) { + if search_fn(&node, sig, threshold) { + for c in self.children(pos) { + queue.push(c); + } + } + } else if let Some(leaf) = self.leaves.get(&pos) { + if search_fn(leaf, sig, threshold) { + matches.push(leaf); + } + } + } + } + + Ok(matches) + } + + fn insert(&mut self, dataset: &L) -> Result<(), Error> { + if self.leaves.is_empty() { + // in this case the tree is empty, + // just add the dataset to the first available leaf + self.leaves.entry(0).or_insert(dataset.clone()); + return Ok(()); + } + + // we can unwrap here because the root node case + // only happens on an empty tree, and if we got + // to this point we have at least one leaf already. + // TODO: find position by similarity search + let pos = self.leaves.keys().max().unwrap() + 1; + let parent_pos = self.parent(pos).unwrap(); + + if let Entry::Occupied(pnode) = self.leaves.entry(parent_pos) { + // Case 1: parent is a Leaf + // create a new internal node, add it to self.nodes[parent_pos] + + let (_, leaf) = pnode.remove_entry(); + + let mut new_node = self.factory(&format!("internal.{}", parent_pos))?; + + // for each children update the parent node + // TODO: write the update method + leaf.update(&mut new_node)?; + dataset.update(&mut new_node)?; + + // node and parent are children of new internal node + let mut c_pos = self.children(parent_pos).into_iter().take(2); + let c1_pos = c_pos.next().unwrap(); + let c2_pos = c_pos.next().unwrap(); + + self.leaves.entry(c1_pos).or_insert(leaf); + self.leaves.entry(c2_pos).or_insert(dataset.clone()); + + // add the new internal node to self.nodes[parent_pos) + // TODO check if it is really empty? 
+ self.nodes.entry(parent_pos).or_insert(new_node); + } else { + // TODO: moved these two lines here to avoid borrow checker + // error E0502 in the Vacant case, but would love to avoid it! + let mut new_node = self.factory(&format!("internal.{}", parent_pos))?; + let c_pos = self.children(parent_pos)[0]; + + match self.nodes.entry(parent_pos) { + // Case 2: parent is a node and has an empty child spot available + // (if there isn't an empty spot, it was already covered by case 1) + Entry::Occupied(mut pnode) => { + dataset.update(&mut pnode.get_mut())?; + self.leaves.entry(pos).or_insert(dataset.clone()); + } + + // Case 3: parent is None/empty + // this can happen with d != 2, need to create parent node + Entry::Vacant(pnode) => { + self.leaves.entry(c_pos).or_insert(dataset.clone()); + dataset.update(&mut new_node)?; + pnode.insert(new_node); + } + } + } + + let mut parent_pos = parent_pos; + while let Some(ppos) = self.parent(parent_pos) { + if let Entry::Occupied(mut pnode) = self.nodes.entry(parent_pos) { + //TODO: use children for this node to update, instead of dragging + // dataset up to the root? It would be more generic, but this + // works for minhash, draff signatures and nodegraphs... + dataset.update(&mut pnode.get_mut())?; + } + parent_pos = ppos; + } + + Ok(()) + } + + fn save>(&self, _path: P) -> Result<(), Error> { + unimplemented!() + } + + fn load>(_path: P) -> Result<(), Error> { + unimplemented!() + } + + fn datasets(&self) -> Vec { + self.leaves.values().cloned().collect() + } +} + +/* +#[derive(TypedBuilder, Clone, Default, Serialize, Deserialize)] +pub struct Factory { + class: String, + args: Vec, +} +*/ + +#[derive(Debug, Clone, Serialize, Deserialize)] +#[serde(tag = "class")] +pub enum Factory { + GraphFactory { args: (u64, f64, u64) }, +} + +#[derive(TypedBuilder, Default, Clone)] +pub struct Node +where + T: std::marker::Sync, +{ + filename: String, + name: String, + metadata: HashMap, + storage: Option>, + #[builder(default)] + pub(crate) data: Rc>, +} + +impl Node +where + T: Sync + ToWriter, +{ + pub fn save(&self, path: &str) -> Result { + if let Some(storage) = &self.storage { + if let Some(data) = self.data.get() { + let mut buffer = Vec::new(); + data.to_writer(&mut buffer)?; + + Ok(storage.save(path, &buffer)?) + } else { + // TODO throw error, data was not initialized + unimplemented!() + } + } else { + unimplemented!() + } + } +} + +impl Dataset +where + T: Sync + ToWriter, +{ + pub fn save(&self, path: &str) -> Result { + if let Some(storage) = &self.storage { + if let Some(data) = self.data.get() { + let mut buffer = Vec::new(); + data.to_writer(&mut buffer)?; + + Ok(storage.save(path, &buffer)?) 
+ } else { + unimplemented!() + } + } else { + unimplemented!() + } + } +} + +impl std::fmt::Debug for Node +where + T: std::marker::Sync + std::fmt::Debug, +{ + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!( + f, + "Node [name={}, filename={}, metadata: {:?}, data: {:?}]", + self.name, + self.filename, + self.metadata, + self.data.get().is_some() + ) + } +} + +#[derive(Serialize, Deserialize, Debug)] +struct NodeInfo { + filename: String, + name: String, + metadata: HashMap, +} + +#[derive(Serialize, Deserialize, Debug)] +#[serde(untagged)] +enum NodeInfoV4 { + Node(NodeInfo), + Dataset(DatasetInfo), +} + +#[derive(Serialize, Deserialize)] +struct SBTInfoV4 { + d: u32, + version: u32, + storage: StorageInfo, + factory: Factory, + nodes: HashMap, +} + +#[derive(Serialize, Deserialize)] +struct SBTInfoV5 { + d: u32, + version: u32, + storage: StorageInfo, + factory: Factory, + nodes: HashMap, + leaves: HashMap, +} + +#[derive(Deserialize)] +#[serde(untagged)] +enum SBTInfo { + V5(SBTInfoV5), + V4(SBTInfoV4), +} + +// This comes from finch +pub struct NoHashHasher(u64); + +impl Default for NoHashHasher { + #[inline] + fn default() -> NoHashHasher { + NoHashHasher(0x0) + } +} + +impl Hasher for NoHashHasher { + #[inline] + fn write(&mut self, bytes: &[u8]) { + *self = NoHashHasher( + (u64::from(bytes[0]) << 24) + + (u64::from(bytes[1]) << 16) + + (u64::from(bytes[2]) << 8) + + u64::from(bytes[3]), + ); + } + fn finish(&self) -> u64 { + self.0 + } +} + +type HashIntersection = HashSet>; + +enum BinaryTree { + Empty, + Internal(Box>), + Dataset(Box>>), +} + +struct TreeNode { + element: T, + left: BinaryTree, + right: BinaryTree, +} + +pub fn scaffold( + mut datasets: Vec>, + storage: Option>, +) -> SBT, Dataset> +where + N: std::marker::Sync + std::clone::Clone + std::default::Default, +{ + let mut leaves: HashMap> = HashMap::with_capacity(datasets.len()); + + let mut next_round = Vec::new(); + + // generate two bottom levels: + // - datasets + // - first level of internal nodes + eprintln!("Start processing leaves"); + while !datasets.is_empty() { + let next_leaf = datasets.pop().unwrap(); + + let (simleaf_tree, in_common) = if datasets.is_empty() { + ( + BinaryTree::Empty, + HashIntersection::from_iter(next_leaf.mins().into_iter()), + ) + } else { + let mut similar_leaf_pos = 0; + let mut current_max = 0; + for (pos, leaf) in datasets.iter().enumerate() { + let common = next_leaf.count_common(leaf); + if common > current_max { + current_max = common; + similar_leaf_pos = pos; + } + } + + let similar_leaf = datasets.remove(similar_leaf_pos); + + let in_common = HashIntersection::from_iter(next_leaf.mins().into_iter()) + .union(&HashIntersection::from_iter( + similar_leaf.mins().into_iter(), + )) + .cloned() + .collect(); + + let simleaf_tree = BinaryTree::Dataset(Box::new(TreeNode { + element: similar_leaf, + left: BinaryTree::Empty, + right: BinaryTree::Empty, + })); + (simleaf_tree, in_common) + }; + + let leaf_tree = BinaryTree::Dataset(Box::new(TreeNode { + element: next_leaf, + left: BinaryTree::Empty, + right: BinaryTree::Empty, + })); + + let tree = BinaryTree::Internal(Box::new(TreeNode { + element: in_common, + left: leaf_tree, + right: simleaf_tree, + })); + + next_round.push(tree); + + if next_round.len() % 100 == 0 { + eprintln!("Processed {} leaves", next_round.len() * 2); + } + } + eprintln!("Finished processing leaves"); + + // while we don't get to the root, generate intermediary levels + while next_round.len() != 1 { + next_round = 
BinaryTree::process_internal_level(next_round); + eprintln!("Finished processing round {}", next_round.len()); + } + + // Convert from binary tree to nodes/leaves + let root = next_round.pop().unwrap(); + let mut visited = HashSet::new(); + let mut queue = vec![(0u64, root)]; + + while !queue.is_empty() { + let (pos, cnode) = queue.pop().unwrap(); + if !visited.contains(&pos) { + visited.insert(pos); + + match cnode { + BinaryTree::Dataset(leaf) => { + leaves.insert(pos, leaf.element); + } + BinaryTree::Internal(mut node) => { + let left = std::mem::replace(&mut node.left, BinaryTree::Empty); + let right = std::mem::replace(&mut node.right, BinaryTree::Empty); + queue.push((2 * pos + 1, left)); + queue.push((2 * pos + 2, right)); + } + BinaryTree::Empty => (), + } + } + } + + SBT::builder() + .storage(storage) + .nodes(HashMap::default()) + .leaves(leaves) + .build() +} + +impl BinaryTree { + fn process_internal_level(mut current_round: Vec) -> Vec { + let mut next_round = Vec::with_capacity(current_round.len() + 1); + + while !current_round.is_empty() { + let next_node = current_round.pop().unwrap(); + + let similar_node = if current_round.is_empty() { + BinaryTree::Empty + } else { + let mut similar_node_pos = 0; + let mut current_max = 0; + for (pos, cmpe) in current_round.iter().enumerate() { + let common = BinaryTree::intersection_size(&next_node, &cmpe); + if common > current_max { + current_max = common; + similar_node_pos = pos; + } + } + current_round.remove(similar_node_pos) + }; + + let tree = BinaryTree::new_tree(next_node, similar_node); + + next_round.push(tree); + } + next_round + } + + fn new_tree(mut left: BinaryTree, mut right: BinaryTree) -> BinaryTree { + let in_common = if let BinaryTree::Internal(ref mut el1) = left { + match right { + BinaryTree::Internal(ref mut el2) => { + let c1 = std::mem::replace(&mut el1.element, HashIntersection::default()); + let c2 = std::mem::replace(&mut el2.element, HashIntersection::default()); + c1.union(&c2).cloned().collect() + } + BinaryTree::Empty => { + std::mem::replace(&mut el1.element, HashIntersection::default()) + } + _ => panic!("Should not see a Dataset at this level"), + } + } else { + HashIntersection::default() + }; + + BinaryTree::Internal(Box::new(TreeNode { + element: in_common, + left, + right, + })) + } + + fn intersection_size(n1: &BinaryTree, n2: &BinaryTree) -> usize { + if let BinaryTree::Internal(ref el1) = n1 { + if let BinaryTree::Internal(ref el2) = n2 { + return el1.element.intersection(&el2.element).count(); + } + }; + 0 + } +} + +#[cfg(test)] +mod test { + use std::fs::File; + use std::io::{BufReader, Seek, SeekFrom}; + use std::path::PathBuf; + use std::rc::Rc; + use tempfile; + + use assert_matches::assert_matches; + use lazy_init::Lazy; + + use super::{scaffold, Factory}; + + use crate::index::linear::LinearIndex; + use crate::index::search::{search_minhashes, search_minhashes_containment}; + use crate::index::{Dataset, Index, MHBT}; + use crate::signature::Signature; + + #[test] + fn save_sbt() { + let mut filename = PathBuf::from(env!("CARGO_MANIFEST_DIR")); + filename.push("tests/test-data/v5.sbt.json"); + + let mut sbt = MHBT::from_path(filename).expect("Loading error"); + + let mut tmpfile = tempfile::NamedTempFile::new().unwrap(); + sbt.save_file(tmpfile.path(), None).unwrap(); + + tmpfile.seek(SeekFrom::Start(0)).unwrap(); + } + + #[test] + fn load_sbt() { + let mut filename = PathBuf::from(env!("CARGO_MANIFEST_DIR")); + filename.push("tests/test-data/v5.sbt.json"); + + let sbt = 
MHBT::from_path(filename).expect("Loading error"); + + assert_eq!(sbt.d, 2); + //assert_eq!(sbt.storage.backend, "FSStorage"); + //assert_eq!(sbt.storage.args["path"], ".sbt.v5"); + //assert_matches!(&sbt.storage, ::FSStorage(args) => { + // assert_eq!(args, &[1, 100000, 4]); + //}); + assert_matches!(&sbt.factory, Factory::GraphFactory { args } => { + assert_eq!(args, &(1, 100000.0, 4)); + }); + + println!("sbt leaves {:?} {:?}", sbt.leaves.len(), sbt.leaves); + + let mut filename = PathBuf::from(env!("CARGO_MANIFEST_DIR")); + filename.push("tests/test-data/.sbt.v3/60f7e23c24a8d94791cc7a8680c493f9"); + + let mut reader = BufReader::new(File::open(filename).unwrap()); + let sigs = Signature::load_signatures(&mut reader, 31, Some("DNA".into()), None).unwrap(); + let sig_data = sigs[0].clone(); + + let data = Lazy::new(); + data.get_or_create(|| sig_data); + + let leaf = Dataset::builder() + .data(Rc::new(data)) + .filename("") + .name("") + .metadata("") + .storage(None) + .build(); + + let results = sbt.find(search_minhashes, &leaf, 0.5).unwrap(); + assert_eq!(results.len(), 1); + println!("results: {:?}", results); + println!("leaf: {:?}", leaf); + + let results = sbt.find(search_minhashes, &leaf, 0.1).unwrap(); + assert_eq!(results.len(), 2); + println!("results: {:?}", results); + println!("leaf: {:?}", leaf); + + let mut linear = LinearIndex::builder().storage(sbt.storage()).build(); + for l in &sbt.leaves { + linear.insert(l.1).unwrap(); + } + + println!( + "linear leaves {:?} {:?}", + linear.datasets.len(), + linear.datasets + ); + + let results = linear.find(search_minhashes, &leaf, 0.5).unwrap(); + assert_eq!(results.len(), 1); + println!("results: {:?}", results); + println!("leaf: {:?}", leaf); + + let results = linear.find(search_minhashes, &leaf, 0.1).unwrap(); + assert_eq!(results.len(), 2); + println!("results: {:?}", results); + println!("leaf: {:?}", leaf); + + let results = linear + .find(search_minhashes_containment, &leaf, 0.5) + .unwrap(); + assert_eq!(results.len(), 2); + println!("results: {:?}", results); + println!("leaf: {:?}", leaf); + + let results = linear + .find(search_minhashes_containment, &leaf, 0.1) + .unwrap(); + assert_eq!(results.len(), 4); + println!("results: {:?}", results); + println!("leaf: {:?}", leaf); + } + + #[test] + fn scaffold_sbt() { + let mut filename = PathBuf::from(env!("CARGO_MANIFEST_DIR")); + filename.push("tests/test-data/v5.sbt.json"); + + let sbt = MHBT::from_path(filename).expect("Loading error"); + + let new_sbt: MHBT = scaffold(sbt.datasets(), sbt.storage()); + + assert_eq!(new_sbt.datasets().len(), 7); + } + + #[test] + fn load_v4() { + let mut filename = PathBuf::from(env!("CARGO_MANIFEST_DIR")); + filename.push("tests/test-data/v4.sbt.json"); + + let _sbt = MHBT::from_path(filename).expect("Loading error"); + } + + #[test] + fn load_v5() { + let mut filename = PathBuf::from(env!("CARGO_MANIFEST_DIR")); + filename.push("tests/test-data/v5.sbt.json"); + + let _sbt = MHBT::from_path(filename).expect("Loading error"); + } +} diff --git a/src/index/sbt/ukhs.rs b/src/index/sbt/ukhs.rs new file mode 100644 index 000000000..2fac9c729 --- /dev/null +++ b/src/index/sbt/ukhs.rs @@ -0,0 +1,129 @@ +use std::collections::HashMap; +use std::mem; +use std::rc::Rc; + +use failure::Error; +use lazy_init::Lazy; + +use crate::index::sbt::{FromFactory, Node, Update, SBT}; +use crate::index::storage::{ReadData, ReadDataError}; +use crate::index::{Comparable, Dataset}; +use crate::signature::Signature; +use crate::sketch::ukhs::{FlatUKHS, 
UKHSTrait};
+use crate::sketch::Sketch;
+
+impl<L> FromFactory<Node<FlatUKHS>> for SBT<Node<FlatUKHS>, L> {
+    fn factory(&self, name: &str) -> Result<Node<FlatUKHS>, Error> {
+        let data = Lazy::new();
+        // TODO: don't hardcode this!
+        data.get_or_create(|| FlatUKHS::new(9, 31).unwrap());
+
+        Ok(Node {
+            name: name.into(),
+            filename: name.into(),
+            metadata: HashMap::default(),
+            storage: self.storage.clone(),
+            data: Rc::new(data),
+        })
+    }
+}
+
+impl Update<Node<FlatUKHS>> for Node<FlatUKHS> {
+    fn update(&self, _other: &mut Node<FlatUKHS>) -> Result<(), Error> {
+        unimplemented!();
+    }
+}
+
+impl Update<Node<FlatUKHS>> for Dataset<Signature> {
+    fn update(&self, other: &mut Node<FlatUKHS>) -> Result<(), Error> {
+        let data = &self.data()?;
+
+        let sigs = if data.signatures.len() > 1 {
+            // TODO: select the right signatures...
+            unimplemented!()
+        } else {
+            &data.signatures[0]
+        };
+
+        if let Sketch::UKHS(sig) = sigs {
+            let mut data: FlatUKHS = other.data()?.clone();
+            data.merge(sig);
+
+            // TODO update metadata?
+
+            let new_data = Lazy::new();
+            new_data.get_or_create(|| data);
+
+            mem::replace(&mut other.data, Rc::new(new_data));
+            return Ok(());
+        }
+        unimplemented!()
+    }
+}
+
+impl Comparable<Node<FlatUKHS>> for Node<FlatUKHS> {
+    fn similarity(&self, other: &Node<FlatUKHS>) -> f64 {
+        let o_sig: &FlatUKHS = other.data().unwrap();
+        let me_sig: &FlatUKHS = self.data().unwrap();
+        1.0 - me_sig.distance(o_sig)
+    }
+
+    fn containment(&self, _other: &Node<FlatUKHS>) -> f64 {
+        unimplemented!();
+    }
+}
+
+impl Comparable<Dataset<Signature>> for Node<FlatUKHS> {
+    fn similarity(&self, other: &Dataset<Signature>) -> f64 {
+        let odata = other.data().unwrap();
+
+        if odata.signatures.len() > 1 {
+            // TODO: select the right signatures...
+            unimplemented!()
+        } else if let Sketch::UKHS(o_sig) = &odata.signatures[0] {
+            // This is doing a variation of Weighted Jaccard.
+            // The internal nodes are built with max(l_i, r_i) for each
+            // left and right children, so if we do a WJ similarity directly
+            // we will underestimate it.
+            //
+            // Instead of doing sum(mins) / sum(maxs), we instead do
+            // sum(mins) / sum(sig_buckets), which might overestimate
+            // but will never lose any leaf.
+            // TODO: is this weighted containment?
+            //
+            // TODO: Still need to test with larger trees! Does it save any
+            // comparisons?
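+            // Worked example with three hypothetical buckets: an internal
+            // node max-merged to [3, 0, 5], compared against a leaf with
+            // buckets [2, 4, 1], gives
+            // sum(mins) / sum(sig_buckets) = (2 + 0 + 1) / (2 + 4 + 1) = 3/7 ≈ 0.43.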
+            let me_sig: &FlatUKHS = self.data().unwrap();
+            let mins: u64 = me_sig
+                .buckets()
+                .zip(o_sig.buckets())
+                .map(|(a, b)| u64::min(*a, *b))
+                .sum();
+            let maxs: u64 = o_sig.buckets().sum();
+
+            mins as f64 / maxs as f64
+        } else {
+            // TODO: sig[0] was not a UKHS
+            unimplemented!()
+        }
+    }
+
+    fn containment(&self, _other: &Dataset<Signature>) -> f64 {
+        unimplemented!();
+    }
+}
+
+impl ReadData<FlatUKHS> for Node<FlatUKHS> {
+    fn data(&self) -> Result<&FlatUKHS, Error> {
+        if let Some(storage) = &self.storage {
+            Ok(self.data.get_or_create(|| {
+                let raw = storage.load(&self.filename).unwrap();
+                FlatUKHS::from_reader(&mut &raw[..]).unwrap()
+            }))
+        } else if let Some(data) = self.data.get() {
+            Ok(data)
+        } else {
+            Err(ReadDataError::LoadError.into())
+        }
+    }
+}
diff --git a/src/index/search.rs b/src/index/search.rs
index 5a301cdc4..8164a28f3 100644
--- a/src/index/search.rs
+++ b/src/index/search.rs
@@ -11,3 +11,24 @@ pub fn search_minhashes_containment(
 ) -> bool {
     node.containment(query) > threshold
 }
+
+pub fn search_minhashes_find_best<L>() -> fn(&dyn Comparable<L>, &L, f64) -> bool {
+    /* TODO: implement the proper function, as a closure that modifies `best_so_far`
+    let mut best_so_far = 0.;
+
+    move |node, query, threshold| {
+        let sim = node.similarity(query);
+        if sim > best_so_far {
+            best_so_far = sim;
+            true
+        } else {
+            sim > threshold
+        }
+    }
+    */
+    unimplemented!();
+}
diff --git a/src/index/storage.rs b/src/index/storage.rs
index 962bc9e9d..184a6805c 100644
--- a/src/index/storage.rs
+++ b/src/index/storage.rs
@@ -1,50 +1,132 @@
-use std::collections::HashMap;
-use std::fs::File;
-use std::io::{BufReader, Read};
+use std::fs::{DirBuilder, File};
+use std::io::{BufReader, BufWriter, Read, Write};
 use std::path::PathBuf;
 
-use derive_builder::Builder;
-use failure::Error;
-use serde_derive::Deserialize;
+use failure::{Error, Fail};
+use serde_derive::{Deserialize, Serialize};
+use typed_builder::TypedBuilder;
+
+#[derive(Debug, Fail)]
+pub enum StorageError {
+    #[fail(display = "Path can't be empty")]
+    EmptyPathError,
+}
+
+#[derive(Debug, Fail)]
+pub enum ReadDataError {
+    #[fail(display = "Could not load data")]
+    LoadError,
+}
 
 /// Implemented by anything that wants to read specific data from a storage.
-pub trait ReadData<D, S: Storage> {
-    fn data(&self, storage: &S) -> Result<&D, Error>;
+pub trait ReadData<D> {
+    fn data(&self) -> Result<&D, Error>;
 }
 
-#[derive(Deserialize)]
+#[derive(Serialize, Deserialize)]
 pub(crate) struct StorageInfo {
-    backend: String,
-    pub(crate) args: HashMap<String, String>,
+    pub(crate) backend: String,
+    pub(crate) args: StorageArgs,
+}
+
+#[derive(Debug, Clone, Serialize, Deserialize)]
+#[serde(untagged)]
+pub enum StorageArgs {
+    FSStorage { path: String },
+}
+
+impl From<&StorageArgs> for FSStorage {
+    fn from(other: &StorageArgs) -> FSStorage {
+        match other {
+            StorageArgs::FSStorage { path } => {
+                let mut fullpath = PathBuf::new();
+                fullpath.push(".");
+                fullpath.push(path);
+
+                FSStorage {
+                    fullpath,
+                    subdir: path.clone(),
+                }
+            }
+        }
+    }
 }
 
 /// An abstraction for any place where we can store data.
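+/// Implementations map a logical path to a concrete backend and report
+/// their constructor arguments (`StorageArgs`) so they can be serialized
+/// along with an index and rebuilt on load.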
 pub trait Storage {
     /// Save bytes into path
-    fn save(&mut self, path: &str, content: &[u8]) -> Result<(), Error>;
+    fn save(&self, path: &str, content: &[u8]) -> Result<String, Error>;
 
     /// Load bytes from path
     fn load(&self, path: &str) -> Result<Vec<u8>, Error>;
+
+    /// Args for initializing a new Storage
+    fn args(&self) -> StorageArgs;
 }
 
 /// Store files locally into a directory
-#[derive(Builder, Debug, Clone, Default)]
+#[derive(TypedBuilder, Debug, Clone, Default)]
 pub struct FSStorage {
     /// absolute path for the directory where data is saved.
-    pub(crate) basepath: PathBuf,
+    pub(crate) fullpath: PathBuf,
+    pub(crate) subdir: String,
+}
+
+impl FSStorage {
+    pub fn new(location: &str, subdir: &str) -> FSStorage {
+        let mut fullpath = PathBuf::new();
+        fullpath.push(location);
+        fullpath.push(subdir);
+
+        FSStorage {
+            fullpath,
+            subdir: subdir.into(),
+        }
+    }
+
+    pub fn set_base(&mut self, location: &str) {
+        let mut fullpath = PathBuf::new();
+        fullpath.push(location);
+        fullpath.push(&self.subdir);
+        self.fullpath = fullpath;
+    }
 }
 
 impl Storage for FSStorage {
-    fn save(&mut self, path: &str, content: &[u8]) -> Result<(), Error> {
-        Ok(())
+    fn save(&self, path: &str, content: &[u8]) -> Result<String, Error> {
+        if path.is_empty() {
+            return Err(StorageError::EmptyPathError.into());
+        }
+
+        let fpath = self.fullpath.join(path);
+        DirBuilder::new()
+            .recursive(true)
+            .create(fpath.parent().unwrap())?;
+
+        let file = File::create(&fpath)?;
+        let mut buf_writer = BufWriter::new(file);
+        buf_writer.write_all(content)?;
+        Ok(path.into())
     }
 
     fn load(&self, path: &str) -> Result<Vec<u8>, Error> {
-        let path = self.basepath.join(path);
+        let path = self.fullpath.join(path);
         let file = File::open(path)?;
         let mut buf_reader = BufReader::new(file);
         let mut contents = Vec::new();
         buf_reader.read_to_end(&mut contents)?;
         Ok(contents)
     }
+
+    fn args(&self) -> StorageArgs {
+        StorageArgs::FSStorage {
+            path: self.subdir.clone(),
+        }
+    }
+}
+
+pub trait ToWriter {
+    fn to_writer<W>(&self, writer: &mut W) -> Result<(), Error>
+    where
+        W: Write;
 }
diff --git a/src/lib.rs b/src/lib.rs
index 637b73da2..63ce3c1b2 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -1,3 +1,21 @@
+//! # Compute, compare and search signatures for nucleotide (DNA/RNA) and protein sequences.
+//!
+//! sourmash is a command-line tool and Python library for computing
+//! [MinHash sketches][0] from DNA sequences, comparing them to each other,
+//! and plotting the results.
+//! This allows you to estimate sequence similarity between even very
+//! large data sets quickly and accurately.
+//!
+//! [0]: https://en.wikipedia.org/wiki/MinHash
+//!
+//! sourmash can be used to quickly search large databases of genomes
+//! for matches to query genomes and metagenomes.
+//!
+//! sourmash also includes k-mer based taxonomic exploration and
+//! classification routines for genome and metagenome analysis. These
+//! routines can use the NCBI taxonomy but do not depend on it in any way.
+//!
+//! Documentation and further examples for each module can be found in the
+//! module descriptions below.
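+//!
+//! A minimal usage sketch (untested; `KmerMinHash` and `SigsTrait` are
+//! defined in the `sketch::minhash` and `signature` modules below):
+//!
+//! ```ignore
+//! use sourmash::signature::SigsTrait;
+//! use sourmash::sketch::minhash::KmerMinHash;
+//!
+//! // num = 1000, k = 21, DNA, seed 42, no max_hash, no abundance tracking
+//! let mut mh = KmerMinHash::new(1000, 21, false, 42, 0, false);
+//! mh.add_sequence(b"TGCCGCCCAGCACCGGGTGACTAGGTTGAGCCAT", false).unwrap();
+//! assert!(mh.size() > 0);
+//! ```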
+ pub mod errors; #[macro_use] @@ -5,812 +23,27 @@ pub mod utils; pub mod index; +pub mod signature; +pub mod sketch; + #[cfg(feature = "from-finch")] pub mod from; -use serde::de::{Deserialize, Deserializer}; -use serde::ser::{Serialize, SerializeStruct, Serializer}; -use serde_derive::{Deserialize, Serialize}; - -use std::cmp::Ordering; -use std::collections::HashMap; -use std::fs::File; -use std::io; -use std::iter::{Iterator, Peekable}; -use std::path::Path; -use std::str; +pub mod cmd; use cfg_if::cfg_if; -use failure::Error; -use lazy_static::lazy_static; use murmurhash3::murmurhash3_x64_128; -use crate::errors::SourmashError; - cfg_if! { if #[cfg(target_arch = "wasm32")] { - use wasm_bindgen::prelude::*; - pub mod wasm; } else { pub mod ffi; } } +type HashIntoType = u64; + pub fn _hash_murmur(kmer: &[u8], seed: u64) -> u64 { murmurhash3_x64_128(kmer, seed).0 } - -#[cfg_attr(target_arch = "wasm32", wasm_bindgen)] -#[derive(Debug, Clone, PartialEq)] -pub struct KmerMinHash { - pub num: u32, - pub ksize: u32, - pub is_protein: bool, - pub seed: u64, - pub max_hash: u64, - mins: Vec, - abunds: Option>, -} - -impl Default for KmerMinHash { - fn default() -> KmerMinHash { - KmerMinHash { - num: 1000, - ksize: 21, - is_protein: false, - seed: 42, - max_hash: 0, - mins: Vec::with_capacity(1000), - abunds: None, - } - } -} - -impl Serialize for KmerMinHash { - fn serialize(&self, serializer: S) -> Result - where - S: Serializer, - { - let n_fields = match &self.abunds { - Some(_) => 8, - _ => 7, - }; - - let mut md5_ctx = md5::Context::new(); - md5_ctx.consume(&self.ksize.to_string()); - self.mins - .iter() - .map(|x| md5_ctx.consume(x.to_string())) - .count(); - - let mut partial = serializer.serialize_struct("KmerMinHash", n_fields)?; - partial.serialize_field("num", &self.num)?; - partial.serialize_field("ksize", &self.ksize)?; - partial.serialize_field("seed", &self.seed)?; - partial.serialize_field("max_hash", &self.max_hash)?; - partial.serialize_field("mins", &self.mins)?; - - partial.serialize_field("md5sum", &format!("{:x}", md5_ctx.compute()))?; - - if let Some(abunds) = &self.abunds { - partial.serialize_field("abundances", abunds)?; - } - - partial.serialize_field( - "molecule", - match &self.is_protein { - true => "protein", - false => "DNA", - }, - )?; - - partial.end() - } -} - -impl<'de> Deserialize<'de> for KmerMinHash { - fn deserialize(deserializer: D) -> Result - where - D: Deserializer<'de>, - { - #[derive(Deserialize)] - struct TempSig { - num: u32, - ksize: u32, - seed: u64, - max_hash: u64, - md5sum: String, - mins: Vec, - abundances: Option>, - molecule: String, - } - - let tmpsig = TempSig::deserialize(deserializer)?; - - let num = if tmpsig.max_hash != 0 { 0 } else { tmpsig.num }; - - Ok(KmerMinHash { - num, - ksize: tmpsig.ksize, - seed: tmpsig.seed, - max_hash: tmpsig.max_hash, - mins: tmpsig.mins, - abunds: tmpsig.abundances, - is_protein: match tmpsig.molecule.as_ref() { - "protein" => true, - "DNA" => false, - _ => false, // TODO: throw error - }, - }) - } -} - -impl KmerMinHash { - pub fn new( - num: u32, - ksize: u32, - is_protein: bool, - seed: u64, - max_hash: u64, - track_abundance: bool, - ) -> KmerMinHash { - let mins: Vec; - let abunds: Option>; - - if num > 0 { - mins = Vec::with_capacity(num as usize); - } else { - mins = Vec::with_capacity(1000); - } - - if track_abundance { - abunds = Some(Vec::with_capacity(mins.capacity())); - } else { - abunds = None - } - - KmerMinHash { - num, - ksize, - is_protein, - seed, - max_hash, - mins, - abunds, - } - 
} - - pub fn check_compatible(&self, other: &KmerMinHash) -> Result { - if self.ksize != other.ksize { - return Err(SourmashError::MismatchKSizes.into()); - } - if self.is_protein != other.is_protein { - return Err(SourmashError::MismatchDNAProt.into()); - } - if self.max_hash != other.max_hash { - return Err(SourmashError::MismatchMaxHash.into()); - } - if self.seed != other.seed { - return Err(SourmashError::MismatchSeed.into()); - } - Ok(true) - } - - pub fn add_hash(&mut self, hash: u64) { - let current_max = match self.mins.last() { - Some(&x) => x, - None => u64::max_value(), - }; - - if hash <= self.max_hash || self.max_hash == 0 { - // empty? add it, if within range / no range specified. - if self.mins.is_empty() { - self.mins.push(hash); - if let Some(ref mut abunds) = self.abunds { - abunds.push(1); - } - return; - } else if hash <= self.max_hash - || current_max > hash - || (self.mins.len() as u32) < self.num - { - // "good" hash - within range, smaller than current entry, or - // still have space available - let pos = match self.mins.binary_search(&hash) { - Ok(p) => p, - Err(p) => p, - }; - - if pos == self.mins.len() { - // at end - must still be growing, we know the list won't - // get too long - self.mins.push(hash); - if let Some(ref mut abunds) = self.abunds { - abunds.push(1); - } - } else if self.mins[pos] != hash { - // didn't find hash in mins, so inserting somewhere - // in the middle; shrink list if needed. - self.mins.insert(pos, hash); - if let Some(ref mut abunds) = self.abunds { - abunds.insert(pos, 1); - } - - // is it too big now? - if self.num != 0 && self.mins.len() > (self.num as usize) { - self.mins.pop(); - if let Some(ref mut abunds) = self.abunds { - abunds.pop(); - } - } - } else if let Some(ref mut abunds) = self.abunds { - // pos == hash: hash value already in mins, inc count - abunds[pos] += 1; - } - } - } - } - - pub fn add_word(&mut self, word: &[u8]) { - let hash = _hash_murmur(word, self.seed); - self.add_hash(hash); - } - - pub fn add_sequence(&mut self, seq: &[u8], force: bool) -> Result<(), Error> { - let sequence: Vec = seq - .iter() - .map(|&x| (x as char).to_ascii_uppercase() as u8) - .collect(); - if sequence.len() >= (self.ksize as usize) { - if !self.is_protein { - // dna - for kmer in sequence.windows(self.ksize as usize) { - if _checkdna(kmer) { - let rc = revcomp(kmer); - if kmer < &rc { - self.add_word(kmer); - } else { - self.add_word(&rc); - } - } else if !force { - return Err(SourmashError::InvalidDNA { - message: String::from_utf8(kmer.to_vec()).unwrap(), - } - .into()); - } - } - } else { - // protein - let rc = revcomp(&sequence); - let aa_ksize = self.ksize / 3; - - for i in 0..3 { - let substr: Vec = sequence - .iter() - .cloned() - .skip(i) - .take(sequence.len() - i) - .collect(); - let aa = to_aa(&substr); - - aa.windows(aa_ksize as usize) - .map(|n| self.add_word(n)) - .count(); - - let rc_substr: Vec = - rc.iter().cloned().skip(i).take(rc.len() - i).collect(); - let aa_rc = to_aa(&rc_substr); - - aa_rc - .windows(aa_ksize as usize) - .map(|n| self.add_word(n)) - .count(); - } - } - } - Ok(()) - } - - pub fn merge(&mut self, other: &KmerMinHash) -> Result<(), Error> { - self.check_compatible(other)?; - let max_size = self.mins.len() + other.mins.len(); - let mut merged: Vec = Vec::with_capacity(max_size); - let mut merged_abunds: Vec = Vec::with_capacity(max_size); - - { - let mut self_iter = self.mins.iter(); - let mut other_iter = other.mins.iter(); - - let mut self_abunds_iter: Option>; - if let Some(ref mut abunds) = 
self.abunds { - self_abunds_iter = Some(abunds.iter()); - } else { - self_abunds_iter = None; - } - - let mut other_abunds_iter: Option>; - if let Some(ref abunds) = other.abunds { - other_abunds_iter = Some(abunds.iter()); - } else { - other_abunds_iter = None; - } - - let mut self_value = self_iter.next(); - let mut other_value = other_iter.next(); - while self_value.is_some() { - let value = self_value.unwrap(); - match other_value { - None => { - merged.push(*value); - merged.extend(self_iter); - if let Some(sai) = self_abunds_iter { - merged_abunds.extend(sai); - } - break; - } - Some(x) if x < value => { - merged.push(*x); - other_value = other_iter.next(); - - if let Some(ref mut oai) = other_abunds_iter { - if let Some(v) = oai.next() { - merged_abunds.push(*v) - } - } - } - Some(x) if x == value => { - merged.push(*x); - other_value = other_iter.next(); - self_value = self_iter.next(); - - if let Some(ref mut oai) = other_abunds_iter { - if let Some(v) = oai.next() { - if let Some(ref mut sai) = self_abunds_iter { - if let Some(s) = sai.next() { - merged_abunds.push(*v + *s) - } - } - } - } - } - Some(x) if x > value => { - merged.push(*value); - self_value = self_iter.next(); - - if let Some(ref mut sai) = self_abunds_iter { - if let Some(v) = sai.next() { - merged_abunds.push(*v) - } - } - } - Some(_) => {} - } - } - if let Some(value) = other_value { - merged.push(*value); - } - merged.extend(other_iter); - if let Some(oai) = other_abunds_iter { - merged_abunds.extend(oai); - } - } - - if merged.len() < (self.num as usize) || (self.num as usize) == 0 { - self.mins = merged; - self.abunds = Some(merged_abunds); - } else { - self.mins = merged - .iter() - .map(|&x| x as u64) - .take(self.num as usize) - .collect(); - self.abunds = Some(merged_abunds) // TODO: reduce this one too - } - Ok(()) - } - - pub fn add_from(&mut self, other: &KmerMinHash) -> Result<(), Error> { - for min in &other.mins { - self.add_hash(*min); - } - Ok(()) - } - - pub fn add_many(&mut self, hashes: &[u64]) -> Result<(), Error> { - for min in hashes { - self.add_hash(*min); - } - Ok(()) - } - - pub fn add_many_with_abund(&mut self, hashes: &[(u64, u64)]) -> Result<(), Error> { - for item in hashes { - for _i in 0..item.1 { - self.add_hash(item.0); - } - } - Ok(()) - } - - pub fn count_common(&self, other: &KmerMinHash) -> Result { - self.check_compatible(other)?; - let iter = Intersection { - left: self.mins.iter().peekable(), - right: other.mins.iter().peekable(), - }; - - Ok(iter.count() as u64) - } - - pub fn intersection(&self, other: &KmerMinHash) -> Result<(Vec, u64), Error> { - self.check_compatible(other)?; - - let mut combined_mh = KmerMinHash::new( - self.num, - self.ksize, - self.is_protein, - self.seed, - self.max_hash, - self.abunds.is_some(), - ); - - combined_mh.merge(&self)?; - combined_mh.merge(&other)?; - - let it1 = Intersection { - left: self.mins.iter().peekable(), - right: other.mins.iter().peekable(), - }; - - // TODO: there is probably a way to avoid this Vec here, - // and pass the it1 as left in it2. 
- let i1: Vec = it1.cloned().collect(); - let it2 = Intersection { - left: i1.iter().peekable(), - right: combined_mh.mins.iter().peekable(), - }; - - let common: Vec = it2.cloned().collect(); - Ok((common, combined_mh.mins.len() as u64)) - } - - pub fn intersection_size(&self, other: &KmerMinHash) -> Result<(u64, u64), Error> { - self.check_compatible(other)?; - - let mut combined_mh = KmerMinHash::new( - self.num, - self.ksize, - self.is_protein, - self.seed, - self.max_hash, - self.abunds.is_some(), - ); - - combined_mh.merge(&self)?; - combined_mh.merge(&other)?; - - let it1 = Intersection { - left: self.mins.iter().peekable(), - right: other.mins.iter().peekable(), - }; - - // TODO: there is probably a way to avoid this Vec here, - // and pass the it1 as left in it2. - let i1: Vec = it1.into_iter().cloned().collect(); - let it2 = Intersection { - left: i1.iter().peekable(), - right: combined_mh.mins.iter().peekable(), - }; - - Ok((it2.count() as u64, combined_mh.mins.len() as u64)) - } - - pub fn compare(&self, other: &KmerMinHash) -> Result { - self.check_compatible(other)?; - if let Ok((common, size)) = self.intersection_size(other) { - return Ok(common as f64 / u64::max(1, size) as f64); - } else { - return Ok(0.0); - } - } - - pub fn size(&self) -> usize { - self.mins.len() - } - - pub fn to_vec(&self) -> Vec { - self.mins.clone() - } -} - -struct Intersection> { - left: Peekable, - right: Peekable, -} - -impl> Iterator for Intersection { - type Item = T; - - fn next(&mut self) -> Option { - loop { - let res = match (self.left.peek(), self.right.peek()) { - (Some(ref left_key), Some(ref right_key)) => left_key.cmp(right_key), - _ => return None, - }; - - match res { - Ordering::Less => { - self.left.next(); - } - Ordering::Greater => { - self.right.next(); - } - Ordering::Equal => { - self.right.next(); - return self.left.next(); - } - } - } - } -} - -#[derive(Serialize, Deserialize, Debug, Clone)] -pub struct Signature { - #[serde(default = "default_class")] - pub class: String, - - #[serde(default)] - pub email: String, - pub hash_function: String, - - pub filename: Option, - pub name: Option, - - #[serde(default = "default_license")] - pub license: String, - - pub signatures: Vec, - - #[serde(default = "default_version")] - pub version: f64, -} - -fn default_license() -> String { - "CC0".to_string() -} - -fn default_class() -> String { - "sourmash_signature".to_string() -} - -fn default_version() -> f64 { - 0.4 -} - -impl Signature { - pub fn from_path>(path: P) -> Result, Error> { - let mut reader = io::BufReader::new(File::open(path)?); - Ok(Signature::from_reader(&mut reader)?) 
- } - - pub fn from_reader(rdr: &mut R) -> Result, Error> - where - R: io::Read, - { - let sigs: Vec = serde_json::from_reader(rdr)?; - Ok(sigs) - } - - pub fn load_signatures( - buf: &mut R, - ksize: usize, - moltype: Option<&str>, - scaled: Option, - ) -> Result, Error> - where - R: io::Read, - { - let orig_sigs = Signature::from_reader(buf)?; - - let flat_sigs = orig_sigs.into_iter().flat_map(|s| { - s.signatures - .iter() - .map(|mh| { - let mut new_s = s.clone(); - new_s.signatures = vec![mh.clone()]; - new_s - }) - .collect::>() - }); - - let filtered_sigs = flat_sigs.filter_map(|mut sig| { - let good_mhs: Vec = sig - .signatures - .into_iter() - .filter(|mh| { - if ksize == 0 || ksize == mh.ksize as usize { - match moltype { - Some(x) => { - if (x.to_lowercase() == "dna" && !mh.is_protein) - || (x.to_lowercase() == "protein" && mh.is_protein) - { - return true; - } - } - None => return true, - }; - }; - false - }) - .collect(); - - if good_mhs.is_empty() { - return None; - }; - - sig.signatures = good_mhs; - Some(sig) - }); - - Ok(filtered_sigs.collect()) - } -} - -impl Default for Signature { - fn default() -> Signature { - Signature { - class: default_class(), - email: "".to_string(), - hash_function: "0.murmur64".to_string(), - license: default_license(), - filename: None, - name: None, - signatures: Vec::::new(), - version: default_version(), - } - } -} - -impl PartialEq for Signature { - fn eq(&self, other: &Signature) -> bool { - let metadata = self.class == other.class - && self.email == other.email - && self.hash_function == other.hash_function - && self.filename == other.filename - && self.name == other.name; - - let mh = &self.signatures[0]; - let other_mh = &other.signatures[0]; - metadata && (mh == other_mh) - } -} - -#[inline] -fn revcomp(seq: &[u8]) -> Vec { - seq.iter() - .rev() - .map(|n| match *n as char { - 'A' | 'a' => 'T', - 'T' | 't' => 'A', - 'C' | 'c' => 'G', - 'G' | 'g' => 'C', - x => x, - } as u8) // TODO: error? - .collect() -} - -lazy_static! 
{ - static ref CODONTABLE: HashMap<&'static str, u8> = { - let mut m = HashMap::new(); - - m.insert("TTT", b'F'); - m.insert("TTC", b'F'); - m.insert("TTA", b'L'); - m.insert("TTG", b'L'); - - m.insert("TCT", b'S'); - m.insert("TCC", b'S'); - m.insert("TCA", b'S'); - m.insert("TCG", b'S'); - - m.insert("TAT", b'Y'); - m.insert("TAC", b'Y'); - m.insert("TAA", b'*'); - m.insert("TAG", b'*'); - - m.insert("TGT", b'C'); - m.insert("TGC", b'C'); - m.insert("TGA", b'*'); - m.insert("TGG", b'W'); - - m.insert("CTT", b'L'); - m.insert("CTC", b'L'); - m.insert("CTA", b'L'); - m.insert("CTG", b'L'); - - m.insert("CCT", b'P'); - m.insert("CCC", b'P'); - m.insert("CCA", b'P'); - m.insert("CCG", b'P'); - - m.insert("CAT", b'H'); - m.insert("CAC", b'H'); - m.insert("CAA", b'Q'); - m.insert("CAG", b'Q'); - - m.insert("CGT", b'R'); - m.insert("CGC", b'R'); - m.insert("CGA", b'R'); - m.insert("CGG", b'R'); - - m.insert("ATT", b'I'); - m.insert("ATC", b'I'); - m.insert("ATA", b'I'); - m.insert("ATG", b'M'); - - m.insert("ACT", b'T'); - m.insert("ACC", b'T'); - m.insert("ACA", b'T'); - m.insert("ACG", b'T'); - - m.insert("AAT", b'N'); - m.insert("AAC", b'N'); - m.insert("AAA", b'K'); - m.insert("AAG", b'K'); - - m.insert("AGT", b'S'); - m.insert("AGC", b'S'); - m.insert("AGA", b'R'); - m.insert("AGG", b'R'); - - m.insert("GTT", b'V'); - m.insert("GTC", b'V'); - m.insert("GTA", b'V'); - m.insert("GTG", b'V'); - - m.insert("GCT", b'A'); - m.insert("GCC", b'A'); - m.insert("GCA", b'A'); - m.insert("GCG", b'A'); - - m.insert("GAT", b'D'); - m.insert("GAC", b'D'); - m.insert("GAA", b'E'); - m.insert("GAG", b'E'); - - m.insert("GGT", b'G'); - m.insert("GGC", b'G'); - m.insert("GGA", b'G'); - m.insert("GGG", b'G'); - - m - }; -} - -#[inline] -fn to_aa(seq: &[u8]) -> Vec { - let mut converted: Vec = Vec::with_capacity(seq.len() / 3); - - for chunk in seq.chunks(3) { - if chunk.len() != 3 { - break; - } - if let Some(codon) = CODONTABLE.get(str::from_utf8(chunk).unwrap()) { - converted.push(*codon); - } - } - - converted -} - -#[inline] -fn _checkdna(seq: &[u8]) -> bool { - for n in seq.iter() { - match *n as char { - 'A' | 'a' | 'C' | 'c' | 'G' | 'g' | 'T' | 't' => (), - _ => return false, - } - } - true -} diff --git a/src/main.rs b/src/main.rs deleted file mode 100644 index 8f1b00681..000000000 --- a/src/main.rs +++ /dev/null @@ -1,274 +0,0 @@ -use std::fs::File; -use std::io; -use std::path::Path; -use std::rc::Rc; - -use clap::{load_yaml, App}; -use exitfailure::ExitFailure; -use failure::{Error, ResultExt}; -use human_panic::setup_panic; -use log::{debug, error, info, LevelFilter}; - -use sourmash::index::nodegraph::Nodegraph; -use sourmash::index::sbt::{scaffold, Node, MHBT, SBT}; -use sourmash::index::search::search_minhashes; -use sourmash::index::{Index, Leaf, LeafBuilder}; -use sourmash::Signature; - -struct Query { - data: T, -} - -impl Query { - fn ksize(&self) -> u64 { - // TODO: this might panic - self.data.signatures[0].ksize as u64 - } - - fn moltype(&self) -> String { - // TODO: this might panic - if self.data.signatures[0].is_protein { - "protein".into() - } else { - "DNA".into() - } - } - - fn name(&self) -> String { - self.data.name.clone().unwrap() - } -} - -impl From> for Leaf { - fn from(other: Query) -> Leaf { - let leaf = LeafBuilder::default().build().unwrap(); - //leaf.data.get_or_create(|| data.query); - leaf - } -} - -fn load_query_signature( - query: &str, - ksize: usize, - moltype: Option<&str>, - scaled: Option, -) -> Result, Error> { - let mut reader = 
io::BufReader::new(File::open(query)?); - let sigs = Signature::load_signatures(&mut reader, ksize, moltype, scaled)?; - - debug!("{:?}", sigs); - // TODO: what if we have more than one left? - let data = sigs[0].clone(); - - Ok(Query { data }) -} - -struct Database { - data: MHBT, - path: String, - is_index: bool, -} - -fn load_sbts_and_sigs( - filenames: &[&str], - query: &Query, - containment: bool, - traverse: bool, -) -> Result, Error> { - let mut dbs = Vec::default(); - - let ksize = query.ksize(); - let moltype = query.moltype(); - - let mut n_signatures = 0; - let mut n_databases = 0; - - for path in filenames { - if traverse && Path::new(path).is_dir() { - continue; - } - - if let Ok(data) = MHBT::from_path(path) { - // TODO: check compatible - dbs.push(Database { - data, - path: String::from(*path), - is_index: true, - }); - info!("loaded SBT {}", path); - n_databases += 1; - continue; - } - - // TODO: load sig, need to change Database - } - - if n_signatures > 0 && n_databases > 0 { - info!( - "loaded {} signatures and {} databases total.", - n_signatures, n_databases - ); - } else if n_signatures > 0 { - info!("loaded {} signatures.", n_signatures); - } else if n_databases > 0 { - info!("loaded {} databases.", n_databases); - } else { - return Err(failure::err_msg("Couldn't load any databases")); - } - - Ok(dbs) -} - -struct Results { - similarity: f64, - match_sig: Signature, -} - -fn search_databases( - query: Query, - databases: &[Database], - threshold: f64, - containment: bool, - best_only: bool, - ignore_abundance: bool, -) -> Result, Error> { - let mut results = Vec::default(); - - let search_fn = search_minhashes; - let query_leaf = query.into(); - - for db in databases { - for dataset in db.data.find(search_fn, &query_leaf, threshold) {} - } - - Ok(results) -} - -fn main() -> Result<(), ExitFailure> { - //setup_panic!(); - - env_logger::Builder::from_env(env_logger::Env::default().default_filter_or("info")).init(); - - let yml = load_yaml!("sourmash.yml"); - let m = App::from_yaml(yml).get_matches(); - - match m.subcommand_name() { - Some("scaffold") => { - let cmd = m.subcommand_matches("scaffold").unwrap(); - let sbt_file = cmd.value_of("current_sbt").unwrap(); - - let sbt = MHBT::from_path(sbt_file)?; - let new_sbt: MHBT = scaffold(sbt.leaves()); - - assert_eq!(new_sbt.leaves().len(), 100); - Ok(()) - } - Some("search") => { - let cmd = m.subcommand_matches("search").unwrap(); - - if cmd.is_present("quiet") { - log::set_max_level(LevelFilter::Warn); - } - - let query = load_query_signature( - cmd.value_of("query").unwrap(), - if cmd.is_present("ksize") { - cmd.value_of("ksize").unwrap().parse().unwrap() - } else { - 0 - }, - Some("dna"), // TODO: select moltype, - if cmd.is_present("scaled") { - Some(cmd.value_of("scaled").unwrap().parse().unwrap()) - } else { - None - }, - )?; - - info!( - "loaded query: {}... 
(k={}, {})", - query.name(), - query.ksize(), - query.moltype() - ); - - let containment = cmd.is_present("containment"); - let traverse_directory = cmd.is_present("traverse-directory"); - let databases = load_sbts_and_sigs( - &cmd.values_of("databases") - .map(|vals| vals.collect::>()) - .unwrap(), - &query, - containment, - traverse_directory, - )?; - - if databases.len() == 0 { - return Err(failure::err_msg("Nothing found to search!").into()); - } - - let best_only = cmd.is_present("best-only"); - let threshold = cmd.value_of("threshold").unwrap().parse().unwrap(); - let ignore_abundance = cmd.is_present("ignore-abundance"); - let results = search_databases( - query, - &databases, - threshold, - containment, - best_only, - ignore_abundance, - )?; - - let num_results = if best_only { - 1 - } else { - cmd.value_of("num-results").unwrap().parse().unwrap() - }; - - let n_matches = if num_results == 0 || results.len() < num_results { - println!("{} matches:", results.len()); - results.len() - } else { - println!("{} matches; showing first {}:", results.len(), num_results); - num_results - }; - - println!("similarity match"); - println!("---------- -----"); - for sr in &results[..n_matches] { - println!( - "{:>6.1}% {:60}", - sr.similarity * 100., - sr.match_sig.name.clone().unwrap() - ); - } - - if best_only { - info!("** reporting only one match because --best-only was set") - } - - /* - if args.output: - fieldnames = ['similarity', 'name', 'filename', 'md5'] - w = csv.DictWriter(args.output, fieldnames=fieldnames) - w.writeheader() - - for sr in &results: - d = dict(sr._asdict()) - del d['match_sig'] - w.writerow(d) - - if args.save_matches: - outname = args.save_matches.name - info!("saving all matched signatures to \"{}\"", outname) - Signature::save_signatures([sr.match_sig for sr in results], args.save_matches) - */ - - Ok(()) - } - _ => { - println!("{:?}", m); - Ok(()) - } - } -} diff --git a/src/signature.rs b/src/signature.rs new file mode 100644 index 000000000..2c10efc2b --- /dev/null +++ b/src/signature.rs @@ -0,0 +1,293 @@ +//! # Compressed representations of genomic data +//! +//! A signature is a collection of sketches for a genomic dataset. 
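+//!
+//! A `Signature` pairs dataset metadata (name, filename, license) with one
+//! or more `Sketch` entries, and `load_signatures` can filter the sketches
+//! by ksize and molecule type while loading them from JSON.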
+
+use std::fs::File;
+use std::io;
+use std::iter::Iterator;
+use std::path::Path;
+use std::str;
+
+use failure::Error;
+use serde_derive::{Deserialize, Serialize};
+use typed_builder::TypedBuilder;
+
+use crate::errors::SourmashError;
+use crate::index::storage::ToWriter;
+use crate::sketch::Sketch;
+
+pub trait SigsTrait {
+    fn size(&self) -> usize;
+    fn to_vec(&self) -> Vec<u64>;
+    fn check_compatible(&self, other: &Self) -> Result<(), Error>;
+    fn add_sequence(&mut self, seq: &[u8], _force: bool) -> Result<(), Error>;
+    fn ksize(&self) -> usize;
+}
+
+impl SigsTrait for Sketch {
+    fn size(&self) -> usize {
+        match *self {
+            Sketch::UKHS(ref ukhs) => ukhs.size(),
+            Sketch::MinHash(ref mh) => mh.size(),
+        }
+    }
+
+    fn to_vec(&self) -> Vec<u64> {
+        match *self {
+            Sketch::UKHS(ref ukhs) => ukhs.to_vec(),
+            Sketch::MinHash(ref mh) => mh.to_vec(),
+        }
+    }
+
+    fn ksize(&self) -> usize {
+        match *self {
+            Sketch::UKHS(ref ukhs) => ukhs.ksize(),
+            Sketch::MinHash(ref mh) => mh.ksize(),
+        }
+    }
+
+    fn check_compatible(&self, other: &Self) -> Result<(), Error> {
+        match *self {
+            Sketch::UKHS(ref ukhs) => match other {
+                Sketch::UKHS(ref ot) => ukhs.check_compatible(ot),
+                _ => Err(SourmashError::MismatchSignatureType.into()),
+            },
+            Sketch::MinHash(ref mh) => match other {
+                Sketch::MinHash(ref ot) => mh.check_compatible(ot),
+                _ => Err(SourmashError::MismatchSignatureType.into()),
+            },
+        }
+    }
+
+    fn add_sequence(&mut self, seq: &[u8], force: bool) -> Result<(), Error> {
+        match *self {
+            Sketch::UKHS(ref mut ukhs) => ukhs.add_sequence(seq, force),
+            Sketch::MinHash(ref mut mh) => mh.add_sequence(seq, force),
+        }
+    }
+}
+
+#[derive(Serialize, Deserialize, Debug, Clone, TypedBuilder)]
+pub struct Signature {
+    #[serde(default = "default_class")]
+    #[builder(default_code = "default_class()")]
+    pub class: String,
+
+    #[serde(default)]
+    #[builder(default)]
+    pub email: String,
+
+    pub hash_function: String,
+
+    #[builder(default)]
+    pub filename: Option<String>,
+
+    pub name: Option<String>,
+
+    #[serde(default = "default_license")]
+    #[builder(default_code = "default_license()")]
+    pub license: String,
+
+    pub signatures: Vec<Sketch>,
+
+    #[serde(default = "default_version")]
+    #[builder(default_code = "default_version()")]
+    pub version: f64,
+}
+
+fn default_license() -> String {
+    "CC0".to_string()
+}
+
+fn default_class() -> String {
+    "sourmash_signature".to_string()
+}
+
+fn default_version() -> f64 {
+    0.4
+}
+
+impl Signature {
+    pub fn name(&self) -> String {
+        if let Some(name) = &self.name {
+            name.clone()
+        } else if let Some(filename) = &self.filename {
+            filename.clone()
+        } else {
+            // TODO md5sum case
+            unimplemented!()
+        }
+    }
+
+    pub fn filename(&self) -> String {
+        if let Some(filename) = &self.filename {
+            filename.clone()
+        } else {
+            unimplemented!()
+        }
+    }
+
+    pub fn md5sum(&self) -> String {
+        if self.signatures.len() == 1 {
+            match &self.signatures[0] {
+                Sketch::MinHash(mh) => mh.md5sum(),
+                Sketch::UKHS(hs) => hs.md5sum(),
+            }
+        } else {
+            // TODO: select the correct signature
+            unimplemented!()
+        }
+    }
+
+    pub fn from_path<P: AsRef<Path>>(path: P) -> Result<Vec<Signature>, Error> {
+        let mut reader = io::BufReader::new(File::open(path)?);
+        Ok(Signature::from_reader(&mut reader)?)
+ } + + pub fn from_reader(rdr: &mut R) -> Result, Error> + where + R: io::Read, + { + let sigs: Vec = serde_json::from_reader(rdr)?; + Ok(sigs) + } + + pub fn load_signatures( + buf: &mut R, + ksize: usize, + moltype: Option<&str>, + _scaled: Option, + ) -> Result, Error> + where + R: io::Read, + { + let orig_sigs = Signature::from_reader(buf)?; + + let flat_sigs = orig_sigs.into_iter().flat_map(|s| { + s.signatures + .iter() + .map(|mh| { + let mut new_s = s.clone(); + new_s.signatures = vec![mh.clone()]; + new_s + }) + .collect::>() + }); + + let filtered_sigs = flat_sigs.filter_map(|mut sig| { + let good_mhs: Vec = sig + .signatures + .into_iter() + .filter(|sig| { + match sig { + Sketch::MinHash(mh) => { + if ksize == 0 || ksize == mh.ksize() as usize { + match moltype { + Some(x) => { + if (x.to_lowercase() == "dna" && !mh.is_protein()) + || (x.to_lowercase() == "protein" && mh.is_protein()) + { + return true; + } + } + None => unimplemented!(), + }; + }; + } + Sketch::UKHS(hs) => { + if ksize == 0 || ksize == hs.ksize() as usize { + match moltype { + Some(x) => { + if x.to_lowercase() == "dna" { + return true; + } else { + // TODO: draff only supports dna for now + unimplemented!() + } + } + None => unimplemented!(), + }; + } + } + }; + false + }) + .collect(); + + if good_mhs.is_empty() { + return None; + }; + + sig.signatures = good_mhs; + Some(sig) + }); + + Ok(filtered_sigs.collect()) + } +} + +impl ToWriter for Signature { + fn to_writer(&self, writer: &mut W) -> Result<(), Error> + where + W: io::Write, + { + match serde_json::to_writer(writer, &self) { + Ok(_) => Ok(()), + Err(_) => Err(SourmashError::SerdeError.into()), + } + } +} + +impl Default for Signature { + fn default() -> Signature { + Signature { + class: default_class(), + email: "".to_string(), + hash_function: "0.murmur64".to_string(), + license: default_license(), + filename: None, + name: None, + signatures: Vec::::new(), + version: default_version(), + } + } +} + +impl PartialEq for Signature { + fn eq(&self, other: &Signature) -> bool { + let metadata = self.class == other.class + && self.email == other.email + && self.hash_function == other.hash_function + && self.filename == other.filename + && self.name == other.name; + + // TODO: find the right signature + // as long as we have a matching + if let Sketch::MinHash(mh) = &self.signatures[0] { + if let Sketch::MinHash(other_mh) = &other.signatures[0] { + return metadata && (mh == other_mh); + } + } + metadata + } +} + +#[cfg(test)] +mod test { + use std::fs::File; + use std::io::BufReader; + use std::path::PathBuf; + + use super::Signature; + + #[test] + fn load_sig() { + let mut filename = PathBuf::from(env!("CARGO_MANIFEST_DIR")); + filename.push("tests/test-data/.sbt.v3/60f7e23c24a8d94791cc7a8680c493f9"); + + let mut reader = BufReader::new(File::open(filename).unwrap()); + let sigs = Signature::load_signatures(&mut reader, 31, Some("DNA".into()), None).unwrap(); + let _sig_data = sigs[0].clone(); + // TODO: check sig_data + } +} diff --git a/src/sketch/minhash.rs b/src/sketch/minhash.rs new file mode 100644 index 000000000..059e8a17d --- /dev/null +++ b/src/sketch/minhash.rs @@ -0,0 +1,687 @@ +use serde::de::{Deserialize, Deserializer}; +use serde::ser::{Serialize, SerializeStruct, Serializer}; +use serde_derive::Deserialize; + +use std::cmp::Ordering; +use std::collections::HashMap; +use std::iter::{Iterator, Peekable}; +use std::str; + +use failure::Error; +use lazy_static::lazy_static; + +use crate::_hash_murmur; +use crate::errors::SourmashError; 
+use crate::signature::SigsTrait; + +#[cfg(target_arch = "wasm32")] +use wasm_bindgen::prelude::*; + +#[cfg_attr(target_arch = "wasm32", wasm_bindgen)] +#[derive(Debug, Clone, PartialEq)] +pub struct KmerMinHash { + num: u32, + ksize: u32, + is_protein: bool, + seed: u64, + max_hash: u64, + pub(crate) mins: Vec, + pub(crate) abunds: Option>, +} + +impl Default for KmerMinHash { + fn default() -> KmerMinHash { + KmerMinHash { + num: 1000, + ksize: 21, + is_protein: false, + seed: 42, + max_hash: 0, + mins: Vec::with_capacity(1000), + abunds: None, + } + } +} + +impl Serialize for KmerMinHash { + fn serialize(&self, serializer: S) -> Result + where + S: Serializer, + { + let n_fields = match &self.abunds { + Some(_) => 8, + _ => 7, + }; + + let mut partial = serializer.serialize_struct("KmerMinHash", n_fields)?; + partial.serialize_field("num", &self.num)?; + partial.serialize_field("ksize", &self.ksize)?; + partial.serialize_field("seed", &self.seed)?; + partial.serialize_field("max_hash", &self.max_hash)?; + partial.serialize_field("mins", &self.mins)?; + partial.serialize_field("md5sum", &self.md5sum())?; + + if let Some(abunds) = &self.abunds { + partial.serialize_field("abundances", abunds)?; + } + + partial.serialize_field( + "molecule", + match &self.is_protein { + true => "protein", + false => "DNA", + }, + )?; + + partial.end() + } +} + +impl<'de> Deserialize<'de> for KmerMinHash { + fn deserialize(deserializer: D) -> Result + where + D: Deserializer<'de>, + { + #[derive(Deserialize)] + struct TempSig { + num: u32, + ksize: u32, + seed: u64, + max_hash: u64, + //md5sum: String, + mins: Vec, + abundances: Option>, + molecule: String, + } + + let tmpsig = TempSig::deserialize(deserializer)?; + + let num = if tmpsig.max_hash != 0 { 0 } else { tmpsig.num }; + + Ok(KmerMinHash { + num, + ksize: tmpsig.ksize, + seed: tmpsig.seed, + max_hash: tmpsig.max_hash, + mins: tmpsig.mins, + abunds: tmpsig.abundances, + is_protein: match tmpsig.molecule.to_lowercase().as_ref() { + "protein" => true, + "dna" => false, + _ => unimplemented!(), + }, + }) + } +} + +impl KmerMinHash { + pub fn new( + num: u32, + ksize: u32, + is_protein: bool, + seed: u64, + max_hash: u64, + track_abundance: bool, + ) -> KmerMinHash { + let mins: Vec; + let abunds: Option>; + + if num > 0 { + mins = Vec::with_capacity(num as usize); + } else { + mins = Vec::with_capacity(1000); + } + + if track_abundance { + abunds = Some(Vec::with_capacity(mins.capacity())); + } else { + abunds = None + } + + KmerMinHash { + num, + ksize, + is_protein, + seed, + max_hash, + mins, + abunds, + } + } + + pub fn num(&self) -> u32 { + self.num + } + + pub fn is_protein(&self) -> bool { + self.is_protein + } + + pub fn seed(&self) -> u64 { + self.seed + } + + pub fn max_hash(&self) -> u64 { + self.max_hash + } + + pub fn md5sum(&self) -> String { + let mut md5_ctx = md5::Context::new(); + md5_ctx.consume(self.ksize().to_string()); + self.mins + .iter() + .map(|x| md5_ctx.consume(x.to_string())) + .count(); + format!("{:x}", md5_ctx.compute()) + } + + pub fn add_hash(&mut self, hash: u64) { + let current_max = match self.mins.last() { + Some(&x) => x, + None => u64::max_value(), + }; + + if hash <= self.max_hash || self.max_hash == 0 { + // empty? add it, if within range / no range specified. 
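+            // Invariant: `mins` stays sorted ascending. A num-bounded sketch
+            // keeps the `num` smallest hashes seen; a scaled sketch (max_hash
+            // set) keeps every hash at or below max_hash.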
+ if self.mins.is_empty() { + self.mins.push(hash); + if let Some(ref mut abunds) = self.abunds { + abunds.push(1); + } + return; + } else if hash <= self.max_hash + || current_max > hash + || (self.mins.len() as u32) < self.num + { + // "good" hash - within range, smaller than current entry, or + // still have space available + let pos = match self.mins.binary_search(&hash) { + Ok(p) => p, + Err(p) => p, + }; + + if pos == self.mins.len() { + // at end - must still be growing, we know the list won't + // get too long + self.mins.push(hash); + if let Some(ref mut abunds) = self.abunds { + abunds.push(1); + } + } else if self.mins[pos] != hash { + // didn't find hash in mins, so inserting somewhere + // in the middle; shrink list if needed. + self.mins.insert(pos, hash); + if let Some(ref mut abunds) = self.abunds { + abunds.insert(pos, 1); + } + + // is it too big now? + if self.num != 0 && self.mins.len() > (self.num as usize) { + self.mins.pop(); + if let Some(ref mut abunds) = self.abunds { + abunds.pop(); + } + } + } else if let Some(ref mut abunds) = self.abunds { + // pos == hash: hash value already in mins, inc count + abunds[pos] += 1; + } + } + } + } + + pub fn add_word(&mut self, word: &[u8]) { + let hash = _hash_murmur(word, self.seed); + self.add_hash(hash); + } + + pub fn remove_hash(&mut self, hash: u64) { + if let Ok(pos) = self.mins.binary_search(&hash) { + if self.mins[pos] == hash { + self.mins.remove(pos); + if let Some(ref mut abunds) = self.abunds { + abunds.remove(pos); + } + } + }; + } + + pub fn remove_many(&mut self, hashes: &[u64]) -> Result<(), Error> { + for min in hashes { + self.remove_hash(*min); + } + Ok(()) + } + + pub fn merge(&mut self, other: &KmerMinHash) -> Result<(), Error> { + self.check_compatible(other)?; + let max_size = self.mins.len() + other.mins.len(); + let mut merged: Vec = Vec::with_capacity(max_size); + let mut merged_abunds: Vec = Vec::with_capacity(max_size); + + { + let mut self_iter = self.mins.iter(); + let mut other_iter = other.mins.iter(); + + let mut self_abunds_iter: Option>; + if let Some(ref mut abunds) = self.abunds { + self_abunds_iter = Some(abunds.iter()); + } else { + self_abunds_iter = None; + } + + let mut other_abunds_iter: Option>; + if let Some(ref abunds) = other.abunds { + other_abunds_iter = Some(abunds.iter()); + } else { + other_abunds_iter = None; + } + + let mut self_value = self_iter.next(); + let mut other_value = other_iter.next(); + while self_value.is_some() { + let value = self_value.unwrap(); + match other_value { + None => { + merged.push(*value); + merged.extend(self_iter); + if let Some(sai) = self_abunds_iter { + merged_abunds.extend(sai); + } + break; + } + Some(x) if x < value => { + merged.push(*x); + other_value = other_iter.next(); + + if let Some(ref mut oai) = other_abunds_iter { + if let Some(v) = oai.next() { + merged_abunds.push(*v) + } + } + } + Some(x) if x == value => { + merged.push(*x); + other_value = other_iter.next(); + self_value = self_iter.next(); + + if let Some(ref mut oai) = other_abunds_iter { + if let Some(v) = oai.next() { + if let Some(ref mut sai) = self_abunds_iter { + if let Some(s) = sai.next() { + merged_abunds.push(*v + *s) + } + } + } + } + } + Some(x) if x > value => { + merged.push(*value); + self_value = self_iter.next(); + + if let Some(ref mut sai) = self_abunds_iter { + if let Some(v) = sai.next() { + merged_abunds.push(*v) + } + } + } + Some(_) => {} + } + } + if let Some(value) = other_value { + merged.push(*value); + } + merged.extend(other_iter); + if let 
Some(oai) = other_abunds_iter { + merged_abunds.extend(oai); + } + } + + if merged.len() < (self.num as usize) || (self.num as usize) == 0 { + self.mins = merged; + self.abunds = Some(merged_abunds); + } else { + self.mins = merged + .iter() + .map(|&x| x as u64) + .take(self.num as usize) + .collect(); + self.abunds = Some(merged_abunds) // TODO: reduce this one too + } + Ok(()) + } + + pub fn add_from(&mut self, other: &KmerMinHash) -> Result<(), Error> { + for min in &other.mins { + self.add_hash(*min); + } + Ok(()) + } + + pub fn add_many(&mut self, hashes: &[u64]) -> Result<(), Error> { + for min in hashes { + self.add_hash(*min); + } + Ok(()) + } + + pub fn add_many_with_abund(&mut self, hashes: &[(u64, u64)]) -> Result<(), Error> { + for item in hashes { + for _i in 0..item.1 { + self.add_hash(item.0); + } + } + Ok(()) + } + + pub fn count_common(&self, other: &KmerMinHash) -> Result { + self.check_compatible(other)?; + let iter = Intersection { + left: self.mins.iter().peekable(), + right: other.mins.iter().peekable(), + }; + + Ok(iter.count() as u64) + } + + pub fn intersection(&self, other: &KmerMinHash) -> Result<(Vec, u64), Error> { + self.check_compatible(other)?; + + let mut combined_mh = KmerMinHash::new( + self.num, + self.ksize, + self.is_protein, + self.seed, + self.max_hash, + self.abunds.is_some(), + ); + + combined_mh.merge(&self)?; + combined_mh.merge(&other)?; + + let it1 = Intersection { + left: self.mins.iter().peekable(), + right: other.mins.iter().peekable(), + }; + + // TODO: there is probably a way to avoid this Vec here, + // and pass the it1 as left in it2. + let i1: Vec = it1.cloned().collect(); + let it2 = Intersection { + left: i1.iter().peekable(), + right: combined_mh.mins.iter().peekable(), + }; + + let common: Vec = it2.cloned().collect(); + Ok((common, combined_mh.mins.len() as u64)) + } + + pub fn intersection_size(&self, other: &KmerMinHash) -> Result<(u64, u64), Error> { + self.check_compatible(other)?; + + let mut combined_mh = KmerMinHash::new( + self.num, + self.ksize, + self.is_protein, + self.seed, + self.max_hash, + self.abunds.is_some(), + ); + + combined_mh.merge(&self)?; + combined_mh.merge(&other)?; + + let it1 = Intersection { + left: self.mins.iter().peekable(), + right: other.mins.iter().peekable(), + }; + + // TODO: there is probably a way to avoid this Vec here, + // and pass the it1 as left in it2. 
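+        // (Both sides of `Intersection` are `Peekable<I>` over the same
+        // iterator type `I`, so `it1` has to be materialized into a Vec
+        // before it can be paired with `combined_mh.mins.iter()`.)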
+ let i1: Vec = it1.cloned().collect(); + let it2 = Intersection { + left: i1.iter().peekable(), + right: combined_mh.mins.iter().peekable(), + }; + + Ok((it2.count() as u64, combined_mh.mins.len() as u64)) + } + + pub fn compare(&self, other: &KmerMinHash) -> Result { + self.check_compatible(other)?; + if let Ok((common, size)) = self.intersection_size(other) { + return Ok(common as f64 / u64::max(1, size) as f64); + } else { + return Ok(0.0); + } + } +} + +impl SigsTrait for KmerMinHash { + fn size(&self) -> usize { + self.mins.len() + } + + fn to_vec(&self) -> Vec { + self.mins.clone() + } + + fn ksize(&self) -> usize { + self.ksize as usize + } + + fn check_compatible(&self, other: &KmerMinHash) -> Result<(), Error> { + if self.ksize != other.ksize { + return Err(SourmashError::MismatchKSizes.into()); + } + if self.is_protein != other.is_protein { + return Err(SourmashError::MismatchDNAProt.into()); + } + if self.max_hash != other.max_hash { + return Err(SourmashError::MismatchMaxHash.into()); + } + if self.seed != other.seed { + return Err(SourmashError::MismatchSeed.into()); + } + Ok(()) + } + + fn add_sequence(&mut self, seq: &[u8], force: bool) -> Result<(), Error> { + let sequence: Vec = seq + .iter() + .map(|&x| (x as char).to_ascii_uppercase() as u8) + .collect(); + if sequence.len() >= (self.ksize as usize) { + if !self.is_protein { + // dna + for kmer in sequence.windows(self.ksize as usize) { + if _checkdna(kmer) { + let rc = revcomp(kmer); + if kmer < &rc { + self.add_word(kmer); + } else { + self.add_word(&rc); + } + } else if !force { + return Err(SourmashError::InvalidDNA { + message: String::from_utf8(kmer.to_vec()).unwrap(), + } + .into()); + } + } + } else { + // protein + let rc = revcomp(&sequence); + let aa_ksize = self.ksize / 3; + + for i in 0..3 { + let substr: Vec = sequence + .iter() + .cloned() + .skip(i) + .take(sequence.len() - i) + .collect(); + let aa = to_aa(&substr); + + aa.windows(aa_ksize as usize) + .map(|n| self.add_word(n)) + .count(); + + let rc_substr: Vec = + rc.iter().cloned().skip(i).take(rc.len() - i).collect(); + let aa_rc = to_aa(&rc_substr); + + aa_rc + .windows(aa_ksize as usize) + .map(|n| self.add_word(n)) + .count(); + } + } + } + Ok(()) + } +} + +struct Intersection> { + left: Peekable, + right: Peekable, +} + +impl> Iterator for Intersection { + type Item = T; + + fn next(&mut self) -> Option { + loop { + let res = match (self.left.peek(), self.right.peek()) { + (Some(ref left_key), Some(ref right_key)) => left_key.cmp(right_key), + _ => return None, + }; + + match res { + Ordering::Less => { + self.left.next(); + } + Ordering::Greater => { + self.right.next(); + } + Ordering::Equal => { + self.right.next(); + return self.left.next(); + } + } + } + } +} + +#[inline] +fn revcomp(seq: &[u8]) -> Vec { + seq.iter() + .rev() + .map(|n| match *n as char { + 'A' | 'a' => 'T', + 'T' | 't' => 'A', + 'C' | 'c' => 'G', + 'G' | 'g' => 'C', + x => x, + } as u8) // TODO: error? + .collect() +} + +lazy_static! 
{ + static ref CODONTABLE: HashMap<&'static str, u8> = { + [ + // F + ("TTT", b'F'), ("TTC", b'F'), + // L + ("TTA", b'L'), ("TTG", b'L'), + + // S + ("TCT", b'S'), ("TCC", b'S'), ("TCA", b'S'), ("TCG", b'S'), + + // Y + ("TAT", b'Y'), ("TAC", b'Y'), + // * + ("TAA", b'*'), ("TAG", b'*'), + + // * + ("TGA", b'*'), + // C + ("TGT", b'C'), ("TGC", b'C'), + // W + ("TGG", b'W'), + + // L + ("CTT", b'L'), ("CTC", b'L'), ("CTA", b'L'), ("CTG", b'L'), + + // P + ("CCT", b'P'), ("CCC", b'P'), ("CCA", b'P'), ("CCG", b'P'), + + // H + ("CAT", b'H'), ("CAC", b'H'), + // Q + ("CAA", b'Q'), ("CAG", b'Q'), + + // R + ("CGT", b'R'), ("CGC", b'R'), ("CGA", b'R'), ("CGG", b'R'), + + // I + ("ATT", b'I'), ("ATC", b'I'), ("ATA", b'I'), + // M + ("ATG", b'M'), + + // T + ("ACT", b'T'), ("ACC", b'T'), ("ACA", b'T'), ("ACG", b'T'), + + // N + ("AAT", b'N'), ("AAC", b'N'), + // K + ("AAA", b'K'), ("AAG", b'K'), + + // S + ("AGT", b'S'), ("AGC", b'S'), + // R + ("AGA", b'R'), ("AGG", b'R'), + + // V + ("GTT", b'V'), ("GTC", b'V'), ("GTA", b'V'), ("GTG", b'V'), + + // A + ("GCT", b'A'), ("GCC", b'A'), ("GCA", b'A'), ("GCG", b'A'), + + // D + ("GAT", b'D'), ("GAC", b'D'), + // E + ("GAA", b'E'), ("GAG", b'E'), + + // G + ("GGT", b'G'), ("GGC", b'G'), ("GGA", b'G'), ("GGG", b'G'), + ].into_iter().cloned().collect() + }; +} + +#[inline] +fn to_aa(seq: &[u8]) -> Vec { + let mut converted: Vec = Vec::with_capacity(seq.len() / 3); + + for chunk in seq.chunks(3) { + if chunk.len() != 3 { + break; + } + if let Some(codon) = CODONTABLE.get(str::from_utf8(chunk).unwrap()) { + converted.push(*codon); + } + } + + converted +} + +#[inline] +fn _checkdna(seq: &[u8]) -> bool { + for n in seq.iter() { + match *n as char { + 'A' | 'a' | 'C' | 'c' | 'G' | 'g' | 'T' | 't' => (), + _ => return false, + } + } + true +} diff --git a/src/sketch/mod.rs b/src/sketch/mod.rs new file mode 100644 index 000000000..9da8dca61 --- /dev/null +++ b/src/sketch/mod.rs @@ -0,0 +1,15 @@ +pub mod minhash; +pub mod nodegraph; +pub mod ukhs; + +use serde_derive::{Deserialize, Serialize}; + +use crate::sketch::minhash::KmerMinHash; +use crate::sketch::ukhs::FlatUKHS; + +#[derive(Debug, Clone, Serialize, Deserialize)] +#[serde(untagged)] +pub enum Sketch { + MinHash(KmerMinHash), + UKHS(FlatUKHS), +} diff --git a/src/index/nodegraph.rs b/src/sketch/nodegraph.rs similarity index 97% rename from src/index/nodegraph.rs rename to src/sketch/nodegraph.rs index 8d589384d..9d8b52b45 100644 --- a/src/index/nodegraph.rs +++ b/src/sketch/nodegraph.rs @@ -5,12 +5,13 @@ use std::path::Path; use byteorder::{BigEndian, LittleEndian, ReadBytesExt, WriteBytesExt}; use failure::Error; use fixedbitset::FixedBitSet; +use primal; -type HashIntoType = u64; +use crate::HashIntoType; -#[derive(Debug, Default, Clone)] +#[derive(Debug, Default, Clone, PartialEq)] pub struct Nodegraph { - bs: Vec, + pub(crate) bs: Vec, ksize: usize, occupied_bins: usize, unique_kmers: usize, @@ -31,6 +32,16 @@ impl Nodegraph { } } + pub fn with_tables(tablesize: usize, n_tables: usize, ksize: usize) -> Nodegraph { + // TODO: cache the Sieve somewhere for repeated calls? 
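+        // e.g. with_tables(100_000, 4, k) walks the primes upward from the
+        // requested size, so the tables end up sized 100_003, 100_019,
+        // 100_043 and 100_049 (the first four primes >= 100_000).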
+ let tablesizes: Vec = primal::Primes::all() + .filter(|p| *p >= tablesize) + .take(n_tables) + .collect(); + + Nodegraph::new(&tablesizes, ksize) + } + pub fn count(&mut self, hash: HashIntoType) -> bool { let mut is_new_kmer = false; @@ -231,7 +242,7 @@ mod test { use std::path::PathBuf; use proptest::num::u64; - use proptest::{prop_assert, prop_assert_eq, prop_assume, proptest, proptest_helper}; + use proptest::{proptest, proptest_helper}; proptest! { #[test] @@ -262,16 +273,11 @@ mod test { let mut buf = Vec::new(); { let mut writer = BufWriter::new(&mut buf); - ng.save_to_writer(&mut writer); + ng.save_to_writer(&mut writer).unwrap(); } let chunk_size = 8; - for (i, (c1, c2)) in data - .to_vec() - .chunks(chunk_size) - .zip(buf.chunks(chunk_size)) - .enumerate() - { + for (c1, c2) in data.to_vec().chunks(chunk_size).zip(buf.chunks(chunk_size)) { assert_eq!(c1, c2); } } diff --git a/src/sketch/ukhs.rs b/src/sketch/ukhs.rs new file mode 100644 index 000000000..47d2b4e54 --- /dev/null +++ b/src/sketch/ukhs.rs @@ -0,0 +1,604 @@ +use std::f64::consts::PI; +use std::fs::File; +use std::hash::BuildHasherDefault; +use std::io::{BufReader, BufWriter, Read, Write}; +use std::mem; +use std::path::Path; + +use failure::Error; +use itertools::Itertools; +use pdatastructs::hyperloglog::HyperLogLog; +use serde::de::{Deserialize, Deserializer}; +use serde::ser::{Serialize, SerializeStruct, Serializer}; +use serde_derive::Deserialize; +use ukhs; + +use crate::errors::SourmashError; +use crate::index::sbt::NoHashHasher; +use crate::index::storage::ToWriter; +use crate::signature::SigsTrait; +use crate::sketch::nodegraph::Nodegraph; + +#[derive(Clone)] +pub struct UKHS { + ukhs: ukhs::UKHS, + buckets: Vec, +} + +impl<'a, T> UKHS { + pub fn buckets(&'a self) -> impl Iterator { + self.buckets.iter() + } +} + +impl UKHS { + pub fn md5sum(&self) -> String { + // TODO: review this! 
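+        // (Same scheme as KmerMinHash::md5sum: digest K, then every bucket
+        // count in order.)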
+ let mut md5_ctx = md5::Context::new(); + md5_ctx.consume(self.ukhs.k().to_string()); + self.buckets + .iter() + .map(|x| md5_ctx.consume(x.to_string())) + .count(); + format!("{:x}", md5_ctx.compute()) + } +} + +impl std::fmt::Debug for UKHS +where + T: std::marker::Sync + std::fmt::Debug, +{ + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!( + f, + "UKHS [W={}, K={}, buckets: {:?}]", + self.ukhs.w(), + self.ukhs.k(), + self.buckets + ) + } +} + +impl Default for UKHS +where + T: Sync + Default, + UKHS: UKHSTrait, +{ + fn default() -> Self { + UKHS::new(7, 21).unwrap() + } +} + +pub type HLL = HyperLogLog>; +pub type MemberUKHS = UKHS; +pub type FlatUKHS = UKHS; +pub type UniqueUKHS = UKHS; + +pub trait UKHSTrait: SigsTrait { + type Storage; + + fn new(ksize: usize, wsize: usize) -> Result, Error>; + + fn reset(&mut self); + + fn merge(&mut self, other: &Self); + + fn distance(&self, other: &Self) -> f64; + + fn to_writer(&self, writer: &mut W) -> Result<(), Error> + where + W: Write; + + fn save>(&self, path: P, _name: &str) -> Result<(), Error> { + let file = File::open(&path)?; + let mut writer = BufWriter::new(file); + + self.to_writer(&mut writer) + } + + fn load>(path: P) -> Result { + let file = File::open(&path)?; + let reader = BufReader::new(file); + + let ukhs = FlatUKHS::from_reader(reader)?; + Ok(ukhs) + } + + fn from_reader(rdr: R) -> Result + where + R: Read, + { + let ukhs = serde_json::from_reader(rdr)?; + Ok(ukhs) + } +} + +impl ToWriter for UKHS +where + UKHS: UKHSTrait, +{ + fn to_writer(&self, writer: &mut W) -> Result<(), Error> + where + W: Write, + { + as UKHSTrait>::to_writer(self, writer) + } +} + +impl UKHSTrait for UKHS { + type Storage = u64; + + fn new(ksize: usize, wsize: usize) -> Result { + let wk_ukhs = ukhs::UKHS::new(ksize, wsize)?; + let len = wk_ukhs.len(); + + Ok(UKHS { + ukhs: wk_ukhs, + buckets: vec![0; len], + }) + } + + fn reset(&mut self) { + self.buckets = vec![0; self.ukhs.len()]; + } + + fn merge(&mut self, other: &Self) { + let max_buckets = self + .buckets + .iter() + .zip(other.buckets.iter()) + //.map(|(c1, c2)| (*c1 / 2) + (*c2 / 2)) + .map(|(c1, c2)| u64::max(*c1, *c2)) + //.map(|(c1, c2)| u64::min(*c1, *c2)) + .collect(); + mem::replace(&mut self.buckets, max_buckets); + } + + fn distance(&self, other: &Self) -> f64 { + // TODO: don't iterate twice... + let prod: f64 = self + .buckets + .iter() + .zip(other.buckets.iter()) + .map(|(a, b)| (a * b) as f64) + .sum(); + let a_sq: f64 = self.buckets.iter().map(|a| (a * a) as f64).sum(); + let b_sq: f64 = other.buckets.iter().map(|a| (a * a) as f64).sum(); + + if a_sq == 0. || b_sq == 0. { + return 0.; + } + + let d = f64::min(prod / (a_sq.sqrt() * b_sq.sqrt()), 1.); + + // TODO: which distance? + // + // this is the angular distance, which is a proper metric. + 2. * d.acos() / PI + // + // this is the cosine distance as defined by scipy + //1. - d + + /* + // This is the weighted Jaccard distance + // TODO: don't iterate twice... + let mins: u64 = self + .buckets + .iter() + .zip(other.buckets.iter()) + .map(|(a, b)| u64::min(*a, *b)) + .sum(); + let maxs: u64 = self + .buckets + .iter() + .zip(other.buckets.iter()) + .map(|(a, b)| u64::max(*a, *b)) + .sum(); + + 1. 
+
+impl SigsTrait for UKHS<u64> {
+    fn size(&self) -> usize {
+        self.buckets.len()
+    }
+
+    fn to_vec(&self) -> Vec<u64> {
+        self.buckets.clone()
+    }
+
+    fn ksize(&self) -> usize {
+        // TODO: return k or w here?
+        self.ukhs.w()
+    }
+
+    fn check_compatible(&self, _other: &Self) -> Result<(), Error> {
+        unimplemented!()
+    }
+
+    fn add_sequence(&mut self, seq: &[u8], _force: bool) -> Result<(), Error> {
+        // TODO: is seq.len() > W?
+        let it: Vec<(u64, u64)> = self.ukhs.hash_iter_sequence(seq)?.collect();
+
+        /* This one updates every unikmer bucket with w_hash:
+        it.into_iter()
+            .map(|(_, k_hash)| {
+                self.buckets[self.ukhs.query_bucket(k_hash).unwrap()] += 1;
+            })
+            .count();
+        */
+
+        // Only update the bucket for the minimum unikmer found in each window.
+        for (_, group) in &it.into_iter().group_by(|(w, _)| *w) {
+            let (_, unikmer) = group.min().unwrap();
+            self.buckets[self.ukhs.query_bucket(unikmer).unwrap()] += 1;
+        }
+
+        Ok(())
+    }
+}
+
+impl UKHSTrait for UKHS<Nodegraph> {
+    type Storage = Nodegraph;
+
+    fn new(ksize: usize, wsize: usize) -> Result<Self, Error> {
+        let wk_ukhs = ukhs::UKHS::new(ksize, wsize)?;
+        let len = wk_ukhs.len();
+
+        Ok(UKHS {
+            ukhs: wk_ukhs,
+            buckets: vec![Nodegraph::with_tables(100_000, 4, wsize); len],
+        })
+    }
+
+    fn reset(&mut self) {
+        self.buckets = vec![Nodegraph::with_tables(100_000, 4, self.ukhs.w()); self.ukhs.len()];
+    }
+
+    fn merge(&mut self, _other: &Self) {
+        unimplemented!()
+    }
+
+    fn distance(&self, _other: &Self) -> f64 {
+        unimplemented!()
+    }
+
+    fn to_writer<W>(&self, writer: &mut W) -> Result<(), Error>
+    where
+        W: Write,
+    {
+        // TODO: avoid cloning?
+        let flat: FlatUKHS = self.into();
+
+        match serde_json::to_writer(writer, &flat) {
+            Ok(_) => Ok(()),
+            Err(_) => Err(SourmashError::SerdeError.into()),
+        }
+    }
+}
+
+impl SigsTrait for UKHS<Nodegraph> {
+    fn size(&self) -> usize {
+        self.buckets.len()
+    }
+
+    fn to_vec(&self) -> Vec<u64> {
+        self.buckets
+            .iter()
+            .map(|b| b.unique_kmers() as u64)
+            .collect()
+    }
+
+    fn ksize(&self) -> usize {
+        // TODO: return k or w here?
+        self.ukhs.w()
+    }
+
+    fn check_compatible(&self, _other: &Self) -> Result<(), Error> {
+        unimplemented!()
+    }
+
+    fn add_sequence(&mut self, seq: &[u8], _force: bool) -> Result<(), Error> {
+        let it: Vec<(u64, u64)> = self.ukhs.hash_iter_sequence(seq)?.collect();
+
+        /* This one updates every unikmer bucket with w_hash:
+        it.into_iter()
+            .map(|(w_hash, k_hash)| {
+                self.buckets[self.ukhs.query_bucket(k_hash).unwrap()].count(w_hash);
+            })
+            .count();
+        */
+
+        // Only update the bucket for the minimum unikmer found in each window.
+        for (w_hash, group) in &it.into_iter().group_by(|(w, _)| *w) {
+            let (_, unikmer) = group.min().unwrap();
+            self.buckets[self.ukhs.query_bucket(unikmer).unwrap()].count(w_hash);
+        }
+
+        Ok(())
+    }
+}
+
+impl From<MemberUKHS> for FlatUKHS {
+    fn from(other: MemberUKHS) -> Self {
+        let buckets = other.to_vec(); // TODO: implement into_vec?
+        let ukhs = other.ukhs;
+
+        FlatUKHS { ukhs, buckets }
+    }
+}
+
+impl From<&MemberUKHS> for FlatUKHS {
+    fn from(other: &MemberUKHS) -> Self {
+        FlatUKHS {
+            ukhs: other.ukhs.clone(),
+            buckets: other.to_vec(), // TODO: also implement into_vec?
+        }
+    }
+}
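All the add_sequence implementations share the same selection step: hash_iter_sequence yields one (w_hash, k_hash) pair per unikmer in each window, pairs from the same window are adjacent, and only the smallest unikmer of each window has its bucket updated. That step in isolation (a sketch using the same itertools::group_by as above; `min_unikmer_per_window` is a hypothetical helper):

    use itertools::Itertools;

    // For each window (identified by w_hash), keep only the minimum unikmer.
    // Assumes pairs from the same window are adjacent, as produced by
    // hash_iter_sequence. Within a group w_hash is constant, so min() on the
    // (w_hash, k_hash) tuple picks the smallest k_hash.
    fn min_unikmer_per_window(pairs: Vec<(u64, u64)>) -> Vec<u64> {
        let mut unikmers = Vec::new();
        for (_w_hash, group) in &pairs.into_iter().group_by(|(w, _)| *w) {
            let (_, unikmer) = group.min().unwrap();
            unikmers.push(unikmer);
        }
        unikmers
    }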
+
+impl UKHSTrait for UKHS<HLL> {
+    type Storage = HLL;
+
+    fn new(ksize: usize, wsize: usize) -> Result<Self, Error> {
+        let wk_ukhs = ukhs::UKHS::new(ksize, wsize)?;
+        let len = wk_ukhs.len();
+
+        let bh = BuildHasherDefault::<NoHashHasher>::default();
+
+        Ok(UKHS {
+            ukhs: wk_ukhs,
+            buckets: vec![HLL::with_hash(14, bh); len], // TODO: space usage is 2^b
+        })
+    }
+
+    fn reset(&mut self) {
+        let bh = BuildHasherDefault::<NoHashHasher>::default();
+        self.buckets = vec![HLL::with_hash(14, bh); self.ukhs.len()];
+    }
+
+    fn merge(&mut self, _other: &Self) {
+        unimplemented!()
+    }
+
+    fn distance(&self, _other: &Self) -> f64 {
+        unimplemented!()
+    }
+
+    fn to_writer<W>(&self, writer: &mut W) -> Result<(), Error>
+    where
+        W: Write,
+    {
+        // TODO: avoid cloning?
+        let flat: FlatUKHS = self.into();
+
+        match serde_json::to_writer(writer, &flat) {
+            Ok(_) => Ok(()),
+            Err(_) => Err(SourmashError::SerdeError.into()),
+        }
+    }
+}
+
+impl SigsTrait for UKHS<HLL> {
+    fn size(&self) -> usize {
+        self.buckets.len()
+    }
+
+    fn to_vec(&self) -> Vec<u64> {
+        self.buckets.iter().map(|b| b.count() as u64).collect()
+    }
+
+    fn ksize(&self) -> usize {
+        // TODO: return k or w here?
+        self.ukhs.w()
+    }
+
+    fn check_compatible(&self, _other: &Self) -> Result<(), Error> {
+        unimplemented!()
+    }
+
+    fn add_sequence(&mut self, seq: &[u8], _force: bool) -> Result<(), Error> {
+        let it: Vec<(u64, u64)> = self.ukhs.hash_iter_sequence(seq)?.collect();
+
+        /* This one updates every unikmer bucket with w_hash:
+        it.into_iter()
+            .map(|(w_hash, k_hash)| {
+                self.buckets[self.ukhs.query_bucket(k_hash).unwrap()].add(&w_hash);
+            })
+            .count();
+        */
+
+        // Only update the bucket for the minimum unikmer found in each window.
+        for (w_hash, group) in &it.into_iter().group_by(|(w, _)| *w) {
+            let (_, unikmer) = group.min().unwrap();
+            self.buckets[self.ukhs.query_bucket(unikmer).unwrap()].add(&w_hash);
+        }
+
+        Ok(())
+    }
+}
+
+impl From<UniqueUKHS> for FlatUKHS {
+    fn from(other: UniqueUKHS) -> Self {
+        let buckets = other.to_vec(); // TODO: implement into_vec?
+        let ukhs = other.ukhs;
+
+        FlatUKHS { ukhs, buckets }
+    }
+}
+
+impl From<&UniqueUKHS> for FlatUKHS {
+    fn from(other: &UniqueUKHS) -> Self {
+        FlatUKHS {
+            ukhs: other.ukhs.clone(),
+            buckets: other.to_vec(), // TODO: also implement into_vec?
+        }
+    }
+}
+
+impl Serialize for UKHS<u64> {
+    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
+    where
+        S: Serializer,
+    {
+        let n_fields = 5;
+
+        let buckets = self.buckets.to_vec();
+
+        let mut partial = serializer.serialize_struct("UKHS", n_fields)?;
+        partial.serialize_field("signature", &buckets)?;
+        partial.serialize_field("W", &self.ukhs.w())?;
+        partial.serialize_field("K", &self.ukhs.k())?;
+        partial.serialize_field("size", &self.buckets.len())?;
+        partial.serialize_field("name", "")?;
+
+        // TODO: properly set name
+        //partial.serialize_field("name", &self.name)?;
+
+        partial.end()
+    }
+}
+
+impl<'de> Deserialize<'de> for UKHS<u64> {
+    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
+    where
+        D: Deserializer<'de>,
+    {
+        #[derive(Deserialize)]
+        struct TempUKHS {
+            signature: Vec<u64>,
+            #[serde(rename = "K")]
+            k: usize,
+            #[serde(rename = "W")]
+            w: usize,
+            //size: usize,
+            //name: String,
+        }
+
+        let tmpukhs = TempUKHS::deserialize(deserializer)?;
+
+        // TODO: remove this unwrap; needs mapping the failure::Error into a
+        // serde error.
+        let mut u = UKHS::<u64>::new(tmpukhs.k, tmpukhs.w).unwrap();
+
+        u.buckets = tmpukhs.signature;
+
+        // TODO: what to do with name?
+
+        Ok(u)
+    }
+}
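Given the two impls above, a FlatUKHS survives a JSON round trip; the serialized layout is {"signature": [...], "W": ..., "K": ..., "size": ..., "name": ""}. A usage sketch, assuming the module ends up exported as sourmash::sketch::ukhs:

    use sourmash::sketch::ukhs::{FlatUKHS, UKHSTrait};

    fn roundtrip() {
        // K=7, W=21, matching the Default impl above.
        let ukhs = FlatUKHS::new(7, 21).unwrap();
        let mut buf = Vec::new();
        ukhs.to_writer(&mut buf).unwrap();
        let back = FlatUKHS::from_reader(&buf[..]).unwrap();
        // Compares buckets, W, and K (see the PartialEq impl below).
        assert!(ukhs == back);
    }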
+
+impl<T> PartialEq for UKHS<T>
+where
+    T: PartialEq,
+{
+    fn eq(&self, other: &UKHS<T>) -> bool {
+        self.buckets
+            .iter()
+            .zip(other.buckets.iter())
+            .all(|(b1, b2)| b1 == b2)
+            && self.ukhs.w() == other.ukhs.w()
+            && self.ukhs.k() == other.ukhs.k()
+    }
+}
+
+// Removed this for now, because calling .into() on these doesn't
+// transfer all the important information...
+/*
+impl From<FlatUKHS> for Dataset {
+    fn from(other: FlatUKHS) -> Dataset {
+        let data = Lazy::new();
+        data.get_or_create(|| other.into());
+
+        Dataset::builder()
+            .data(Rc::new(data))
+            .filename("")
+            .name("")
+            .metadata("")
+            .storage(None)
+            .build()
+    }
+}
+
+impl From<FlatUKHS> for Signature {
+    fn from(other: FlatUKHS) -> Signature {
+        Signature::builder()
+            .hash_function("nthash") // TODO: spec!
+            .class("draff_signature") // TODO: spec!
+            .name(Some("draff_file".into())) // TODO: spec!
+            .signatures(vec![Sketch::UKHS(other)])
+            .build()
+    }
+}
+*/
+
+#[cfg(test)]
+mod test {
+    use std::path::PathBuf;
+
+    use bio::io::fasta::Reader;
+    use ocf::get_input;
+
+    use super::{FlatUKHS, MemberUKHS, UKHSTrait};
+    use crate::signature::SigsTrait;
+
+    #[test]
+    fn ukhs_add_sequence() {
+        let mut filename = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
+        filename.push("tests/test-data/ecoli.genes.fna");
+
+        let mut ukhs = MemberUKHS::new(9, 21).unwrap();
+
+        let (input, _) = get_input(filename.to_str().unwrap()).unwrap();
+        let reader = Reader::new(input);
+
+        for record in reader.records() {
+            let record = record.unwrap();
+            ukhs.add_sequence(record.seq(), false).unwrap();
+        }
+
+        // TODO: find test case...
+        //assert_eq!(ukhs.to_vec(), [1, 2, 3]);
+    }
+
+    #[test]
+    fn ukhs_writer_reader() {
+        let mut filename = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
+        filename.push("tests/test-data/ecoli.genes.fna");
+
+        let mut ukhs = FlatUKHS::new(9, 21).unwrap();
+
+        let (input, _) = get_input(filename.to_str().unwrap()).unwrap();
+        let reader = Reader::new(input);
+
+        for record in reader.records() {
+            let record = record.unwrap();
+            ukhs.add_sequence(record.seq(), false).unwrap();
+        }
+
+        let mut buffer = Vec::new();
+        ukhs.to_writer(&mut buffer).unwrap();
+
+        match FlatUKHS::from_reader(&buffer[..]) {
+            Ok(new_ukhs) => {
+                assert_eq!(ukhs.buckets, new_ukhs.buckets);
+            }
+            Err(e) => panic!("roundtrip failed: {:?}", e),
+        }
+    }
+}
diff --git a/tests/finch.rs b/tests/finch.rs
index 64555d29f..8b1378917 100644
--- a/tests/finch.rs
+++ b/tests/finch.rs
@@ -1,88 +1 @@
-#[cfg(feature = "from-finch")]
-extern crate finch;
-#[cfg(feature = "from-finch")]
-extern crate needletail;
-
-use std::collections::HashMap;
-use std::collections::HashSet;
-use std::iter::FromIterator;
-
-use sourmash::KmerMinHash;
-
-#[cfg(feature = "from-finch")]
-use finch::minhashes::MinHashKmers;
-
-#[cfg(feature = "from-finch")]
-use needletail::kmer::canonical;
-
-#[cfg(feature = "from-finch")]
-#[test]
-fn finch_behavior() {
-    let mut a = KmerMinHash::new(20, 10, false, 42, 0, true);
-    let mut b = MinHashKmers::new(20, 42);
-
-    let seq = b"TGCCGCCCAGCACCGGGTGACTAGGTTGAGCCATGATTAACCTGCAATGA";
-
-    a.add_sequence(seq, false);
-
-    for kmer in seq.windows(10) {
-        b.push(&canonical(kmer), 0);
-    }
-
-    let b_hashes = b.into_vec();
-
-    let s1: HashSet<_> = HashSet::from_iter(a.mins.iter().map(|x| *x));
-    let s2: HashSet<_> = HashSet::from_iter(b_hashes.iter().map(|x| x.hash as u64));
-    let i1 = &s1 & &s2;
-
-    assert!(i1.len() == a.mins.len());
-    assert!(i1.len() == b_hashes.len());
-
-    if let Some(abunds) = a.abunds {
-        let smap: HashMap<_, _> = HashMap::from_iter(a.mins.iter().zip(abunds.iter()));
-        println!("{:?}", smap);
-        for item in b_hashes.iter() {
-            assert!(smap.contains_key(&(item.hash as u64)));
-            assert!(
-                **smap.get(&(item.hash as u64)).unwrap()
-                    == ((item.count + item.extra_count) as u64)
-            );
-        }
-    }
-}
-
-#[cfg(feature = "from-finch")]
-#[test]
-fn from_finch() {
-    let mut a = KmerMinHash::new(20, 10, false, 42, 0, true);
-    let mut b = MinHashKmers::new(20, 42);
-
-    let seq = b"TGCCGCCCAGCACCGGGTGACTAGGTTGAGCCATGATTAACCTGCAATGA";
-
-    a.add_sequence(seq, false);
-
-    for kmer in seq.windows(10) {
-        b.push(&canonical(kmer), 0);
-    }
-
-    let c = KmerMinHash::from(b);
-
-    let s1: HashSet<_> = HashSet::from_iter(a.mins.iter().map(|x| *x));
-    let s2: HashSet<_> = HashSet::from_iter(c.mins.iter().map(|x| *x));
-    let i1 = &s1 & &s2;
-
-    assert!(i1.len() == a.mins.len());
-    assert!(i1.len() == c.mins.len());
-
-    if let Some(a_abunds) = a.abunds {
-        if let Some(c_abunds) = c.abunds {
-            let a_smap: HashMap<_, _> = HashMap::from_iter(a.mins.iter().zip(a_abunds.iter()));
-            let c_smap: HashMap<_, _> = HashMap::from_iter(c.mins.iter().zip(c_abunds.iter()));
-            for item in a_smap.iter() {
-                assert!(c_smap.contains_key(*item.0));
-                assert!(c_smap.get(*item.0).unwrap() == item.1);
-            }
-        }
-    }
-}
diff --git a/tests/minhash.rs b/tests/minhash.rs
index 0193e6f03..717df8182 100644
--- a/tests/minhash.rs
+++ b/tests/minhash.rs
@@ -1,4 +1,5 @@
-use sourmash::KmerMinHash;
+use sourmash::signature::SigsTrait;
+use sourmash::sketch::minhash::KmerMinHash;
 
 #[test]
 fn throws_error() {
@@ -15,13 +16,13 @@ fn merge() {
     let mut a = KmerMinHash::new(20, 10, false, 42, 0, false);
     let mut b = KmerMinHash::new(20, 10, false, 42, 0, false);
 
-    a.add_sequence(b"TGCCGCCCAGCA", false);
-    b.add_sequence(b"TGCCGCCCAGCA", false);
+    a.add_sequence(b"TGCCGCCCAGCA", false).unwrap();
+    b.add_sequence(b"TGCCGCCCAGCA", false).unwrap();
 
-    a.add_sequence(b"GTCCGCCCAGTGA", false);
-    b.add_sequence(b"GTCCGCCCAGTGG", false);
+    a.add_sequence(b"GTCCGCCCAGTGA", false).unwrap();
+    b.add_sequence(b"GTCCGCCCAGTGG", false).unwrap();
 
-    a.merge(&b);
+    a.merge(&b).unwrap();
     assert_eq!(
         a.to_vec(),
         vec![
@@ -42,20 +43,24 @@ fn compare() {
     let mut a = KmerMinHash::new(20, 10, false, 42, 0, false);
     let mut b = KmerMinHash::new(20, 10, false, 42, 0, false);
 
-    a.add_sequence(b"TGCCGCCCAGCACCGGGTGACTAGGTTGAGCCATGATTAACCTGCAATGA", false);
-    b.add_sequence(b"TGCCGCCCAGCACCGGGTGACTAGGTTGAGCCATGATTAACCTGCAATGA", false);
+    a.add_sequence(b"TGCCGCCCAGCACCGGGTGACTAGGTTGAGCCATGATTAACCTGCAATGA", false)
+        .unwrap();
+    b.add_sequence(b"TGCCGCCCAGCACCGGGTGACTAGGTTGAGCCATGATTAACCTGCAATGA", false)
+        .unwrap();
 
     assert_eq!(a.compare(&b).unwrap(), 1.0);
     // assert_eq!(b.compare(&b).unwrap(), 1.0);
     assert_eq!(b.compare(&a).unwrap(), 1.0);
     // assert_eq!(a.compare(&a).unwrap(), 1.0);
 
-    b.add_sequence(b"TGCCGCCCAGCACCGGGTGACTAGGTTGAGCCATGATTAACCTGCAATGA", false);
+    b.add_sequence(b"TGCCGCCCAGCACCGGGTGACTAGGTTGAGCCATGATTAACCTGCAATGA", false)
+        .unwrap();
 
     assert_eq!(a.compare(&b).unwrap(), 1.0);
     // assert_eq!(b.compare(&b).unwrap(), 1.0);
     assert_eq!(b.compare(&a).unwrap(), 1.0);
     // assert_eq!(a.compare(&a).unwrap(), 1.0);
 
-    b.add_sequence(b"GATTGGTGCACACTTAACTGGGTGCCGCGCTGGTGCTGATCCATGAAGTT", false);
+    b.add_sequence(b"GATTGGTGCACACTTAACTGGGTGCCGCGCTGGTGCTGATCCATGAAGTT", false)
+        .unwrap();
     assert!(a.compare(&b).unwrap() >= 0.3);
     assert!(b.compare(&a).unwrap() >= 0.3);
 }
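tests/minhash.rs now pulls KmerMinHash from sourmash::sketch::minhash and brings SigsTrait into scope, since add_sequence moved into that trait and became fallible; that is why .unwrap() appears throughout the tests. A minimal sketch of the new calling convention (constructor arguments exactly as the tests use them):

    use sourmash::signature::SigsTrait;
    use sourmash::sketch::minhash::KmerMinHash;

    fn demo() {
        // Same positional arguments as in the tests above:
        // (num, ksize, is_protein, seed, max_hash, track_abundance) is my
        // reading of them; the constructor itself is unchanged by this PR.
        let mut mh = KmerMinHash::new(20, 10, false, 42, 0, false);
        // add_sequence now returns a Result instead of ().
        mh.add_sequence(b"TGCCGCCCAGCA", false).unwrap();
        println!("{:?}", mh.to_vec());
    }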
diff --git a/tests/signature.rs b/tests/signature.rs
index c6d8aed37..99ff889c2 100644
--- a/tests/signature.rs
+++ b/tests/signature.rs
@@ -4,7 +4,7 @@ use std::fs::File;
 use std::io::BufReader;
 use std::path::PathBuf;
 
-use sourmash::{KmerMinHash, Signature};
+use sourmash::signature::Signature;
 
 #[test]
 fn load_signature() {
diff --git a/tests/test-data/v4.sbt.json b/tests/test-data/v4.sbt.json
new file mode 100644
index 000000000..763c8756d
--- /dev/null
+++ b/tests/test-data/v4.sbt.json
@@ -0,0 +1 @@
+{"d":2,"version":4,"storage":{"backend":"FSStorage","args":{"path":".sbt.v3"}},"factory":{"class":"GraphFactory","args":[1,100000.0,4]},"nodes":{"0":{"filename":"internal.0","name":"internal.0","metadata":{"min_n_below":500}},"1":{"filename":"internal.1","name":"internal.1","metadata":{"min_n_below":500}},"2":{"filename":"internal.2","name":"internal.2","metadata":{"min_n_below":500}},"3":{"filename":"internal.3","name":"internal.3","metadata":{"min_n_below":500}},"4":{"filename":"internal.4","name":"internal.4","metadata":{"min_n_below":500}},"5":{"filename":"internal.5","name":"internal.5","metadata":{"min_n_below":500}},"6":{"filename":"6d6e87e1154e95b279e5e7db414bc37b","name":"6d6e87e1154e95b279e5e7db414bc37b","metadata":"6d6e87e1154e95b279e5e7db414bc37b"},"7":{"filename":"60f7e23c24a8d94791cc7a8680c493f9","name":"60f7e23c24a8d94791cc7a8680c493f9","metadata":"60f7e23c24a8d94791cc7a8680c493f9"},"8":{"filename":"0107d767a345eff67ecdaed2ee5cd7ba","name":"0107d767a345eff67ecdaed2ee5cd7ba","metadata":"0107d767a345eff67ecdaed2ee5cd7ba"},"9":{"filename":"f71e78178af9e45e6f1d87a0c53c465c","name":"f71e78178af9e45e6f1d87a0c53c465c","metadata":"f71e78178af9e45e6f1d87a0c53c465c"},"10":{"filename":"f0c834bc306651d2b9321fb21d3e8d8f","name":"f0c834bc306651d2b9321fb21d3e8d8f","metadata":"f0c834bc306651d2b9321fb21d3e8d8f"},"11":{"filename":"4e94e60265e04f0763142e20b52c0da1","name":"4e94e60265e04f0763142e20b52c0da1","metadata":"4e94e60265e04f0763142e20b52c0da1"},"12":{"filename":"b59473c94ff2889eca5d7165936e64b3","name":"b59473c94ff2889eca5d7165936e64b3","metadata":"b59473c94ff2889eca5d7165936e64b3"}}}
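The new fixture exercises the v4 SBT parsing added by this PR (see src/index/sbt in the file list): a depth, a version tag, a storage spec, a graph factory spec, and a flat map from node position to node info, where metadata is an object for internal nodes and a bare string for leaves. Its shape, mirrored as hypothetical serde structs (sketch only; the actual types live in the sbt module and likely differ):

    use std::collections::HashMap;
    use serde_derive::Deserialize;

    // Hypothetical mirror of the v4 JSON above, just enough for
    // serde_json::from_str::<SbtV4>(...) to accept the fixture.
    #[derive(Deserialize)]
    struct SbtV4 {
        d: u32,
        version: u32,
        storage: StorageV4,
        factory: FactoryV4,
        nodes: HashMap<String, NodeV4>,
    }

    #[derive(Deserialize)]
    struct StorageV4 {
        backend: String,         // "FSStorage"
        args: serde_json::Value, // {"path": ".sbt.v3"}
    }

    #[derive(Deserialize)]
    struct FactoryV4 {
        class: String,                // "GraphFactory"
        args: Vec<serde_json::Value>, // [1, 100000.0, 4]
    }

    #[derive(Deserialize)]
    struct NodeV4 {
        filename: String,
        name: String,
        metadata: serde_json::Value, // object for internal nodes, string for leaves
    }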