Big refactor on the rust codebase (#692)
save semi-working
initial bigsi
Rename Leaf to Dataset
remove storage from readdata trait
basic search working
start best_only
UKHS save and load, expose draff command
move ffi to mod dir
split ffi functions
initial draff command
keeping track of unique kmers with a Nodegraph
Fix overcounting, add HLL storage
make tests deterministic
move Signature from root to submodule
move KmerMinHash into submodule
fix finch
make index consistent with other submodules
starting support for multiple signature types
move add_sequence and check_compatible to SigsTrait
use unimplemented if something is missing
initial docs and warnings cleanup
move sbt into submodule
reorganize sbt module
insertion working for SBT
Replace derive_builder with typed_builder
using fastx parser from rust-bio
search saves output
Rename Signatures enum to Sketch
Move nodegraph into sketch
Storage args are a proper enum now
explicit sig and dataset conversion
Use syntax inspired by czbiohub/extract_kmers for codon table (original commit: https://github.com/czbiohub/extract_kmers/blob/d036145000bff96454ec383b193f8913fc5e2b16/src/codon_table.rs)
demo command for counting unique hashes in an SBT
implement sbt v4 parsing, and clean up clippy warnings
Start replacing the Factory struct with an enum
move smrs bin to src/bin
split draff commands from smrs
fix v4 loading
luizirber committed Jul 7, 2019
1 parent 534d269 commit 0d69b79
Showing 36 changed files with 5,160 additions and 2,627 deletions.
18 changes: 10 additions & 8 deletions Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "sourmash"
version = "0.2.3"
version = "0.3.0"
authors = ["Luiz Irber <luiz.irber@gmail.com>"]
description = "MinHash sketches for genomic data"
repository = "https://github.com/luizirber/sourmash-rust"
@@ -17,11 +17,6 @@ bench = false
[profile.release]
lto=true

[[bin]]
bench = false
path = "src/main.rs"
name = "smrs"

[features]
from-finch = ["finch", "needletail"]

@@ -31,11 +26,9 @@ from-finch = ["finch", "needletail"]
#cbindgen = "~0.6.7"

[dependencies]
backtrace = "=0.3.9" # https://github.com/alexcrichton/backtrace-rs/issues/147
byteorder = "^1.2"
cfg-if = "0.1"
clap = { version = "~2.32", features = ["yaml"] }
derive_builder = "^0.7"
env_logger = "0.6.0"
exitfailure = "0.5.1"
failure = "0.1.3"
@@ -52,6 +45,13 @@ needletail = { version = "~0.2.1", optional = true }
serde = "1.0"
serde_derive = "~1.0.58"
serde_json = "1.0.2"
ukhs = "0.3.4"
bio = { git = "https://github.com/luizirber/rust-bio", branch = "feature/fastx_reader" }
primal = "0.2.3"
pdatastructs = "0.5.0"
itertools = "0.8.0"
typed-builder = "0.3.0"
csv = "1.0.7"

[target.'cfg(target_arch = "wasm32")'.dependencies.wasm-bindgen]
version = "^0.2"
@@ -67,6 +67,8 @@ features = ["bz2"]
proptest = "^0.8"
criterion = "^0.2"
rand = "^0.5"
tempfile = "3"
assert_matches = "1.2"

[[bench]]
name = "index"
95 changes: 57 additions & 38 deletions benches/index.rs
@@ -4,79 +4,98 @@ extern crate criterion;
use std::path::PathBuf;

use criterion::{Bencher, Criterion, Fun};
use sourmash::index::linear::LinearIndexBuilder;
use sourmash::index::nodegraph::Nodegraph;
use sourmash::index::sbt::{Node, MHBT, SBT};
use sourmash::index::search::search_minhashes;
use sourmash::index::{Index, Leaf};
use sourmash::Signature;
use sourmash::index::bigsi::BIGSI;
use sourmash::index::linear::LinearIndex;
use sourmash::index::storage::ReadData;
use sourmash::index::MHBT;
use sourmash::index::{Dataset, Index};
use sourmash::signature::Signature;

fn find_small_bench(c: &mut Criterion) {
let mut filename = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
filename.push("tests/test-data/v5.sbt.json");

let sbt: MHBT = SBT::from_path(filename).expect("Loading error");
let sbt = MHBT::from_path(filename).expect("Loading error");

let leaf: Leaf<Signature> = (*sbt.leaves().first().unwrap()).clone();
let leaf: Dataset<Signature> = (*sbt.datasets().first().unwrap()).clone();

let mut linear = LinearIndexBuilder::default()
.storage(sbt.storage())
.build()
.unwrap();
for l in &sbt.leaves() {
let mut linear = LinearIndex::builder().storage(sbt.storage()).build();

for l in &sbt.datasets() {
linear.insert(l);
}

let mut bigsi = BIGSI::new(10000, 10);
for l in &sbt.datasets() {
let data = l.data().unwrap();
bigsi.insert(data);
}

let sbt_find = Fun::new(
"sbt_find",
move |b: &mut Bencher, leaf: &Leaf<Signature>| {
b.iter(|| sbt.find(search_minhashes, leaf, 0.1))
},
"sbt_search",
move |b: &mut Bencher, leaf: &Dataset<Signature>| b.iter(|| sbt.search(leaf, 0.1, false)),
);

let linear_find = Fun::new(
"linear_find",
move |b: &mut Bencher, leaf: &Leaf<Signature>| {
b.iter(|| linear.find(search_minhashes, leaf, 0.1))
"linear_search",
move |b: &mut Bencher, leaf: &Dataset<Signature>| {
b.iter(|| linear.search(leaf, 0.1, false))
},
);

let bigsi_find = Fun::new(
"bigsi_search",
move |b: &mut Bencher, leaf: &Dataset<Signature>| {
let data = leaf.data().unwrap();
b.iter(|| bigsi.search(data, 0.1, false))
},
);

let functions = vec![sbt_find, linear_find];
c.bench_functions("find_small", functions, leaf);
let functions = vec![sbt_find, linear_find, bigsi_find];
c.bench_functions("search_small", functions, leaf);
}

fn find_subset_bench(c: &mut Criterion) {
let mut filename = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
filename.push("tests/test-data/subset.sbt.json");

let sbt: MHBT = SBT::from_path(filename).expect("Loading error");
let sbt = MHBT::from_path(filename).expect("Loading error");

let leaf: Leaf<Signature> = (*sbt.leaves().first().unwrap()).clone();
let leaf: Dataset<Signature> = (*sbt.datasets().first().unwrap()).clone();

let mut linear = LinearIndexBuilder::default()
.storage(sbt.storage())
.build()
.unwrap();
for l in &sbt.leaves() {
let mut linear = LinearIndex::builder().storage(sbt.storage()).build();
for l in &sbt.datasets() {
linear.insert(l);
}

let mut bigsi = BIGSI::new(10000, 10);
for l in &sbt.datasets() {
let data = l.data().unwrap();
bigsi.insert(data);
}

let sbt_find = Fun::new(
"sbt_find",
move |b: &mut Bencher, leaf: &Leaf<Signature>| {
b.iter(|| sbt.find(search_minhashes, leaf, 0.1))
},
"sbt_search",
move |b: &mut Bencher, leaf: &Dataset<Signature>| b.iter(|| sbt.search(leaf, 0.1, false)),
);

let linear_find = Fun::new(
"linear_find",
move |b: &mut Bencher, leaf: &Leaf<Signature>| {
b.iter(|| linear.find(search_minhashes, leaf, 0.1))
"linear_search",
move |b: &mut Bencher, leaf: &Dataset<Signature>| {
b.iter(|| linear.search(leaf, 0.1, false))
},
);

let bigsi_find = Fun::new(
"bigsi_search",
move |b: &mut Bencher, leaf: &Dataset<Signature>| {
let data = leaf.data().unwrap();
b.iter(|| bigsi.search(data, 0.1, false))
},
);

let functions = vec![sbt_find, linear_find];
c.bench_functions("find_subset", functions, leaf);
let functions = vec![sbt_find, linear_find, bigsi_find];
c.bench_functions("search_subset", functions, leaf);
}

criterion_group!(benches, find_small_bench, find_subset_bench);
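The benchmark now calls `LinearIndex::builder().storage(...).build()` instead of the old `LinearIndexBuilder::default()...build().unwrap()`, reflecting the "Replace derive_builder with typed_builder" step in the commit message. A minimal sketch of that builder pattern, using an illustrative struct rather than the real `LinearIndex`:

// Sketch of the typed_builder pattern adopted in this refactor.
// `Example` and its fields are illustrative, not the actual LinearIndex.
use typed_builder::TypedBuilder;

#[derive(TypedBuilder)]
struct Example {
    storage: String,
    threshold: f64,
}

fn main() {
    // Every required field must be set before .build() type-checks,
    // so there is no runtime Result/unwrap() as with derive_builder.
    let e = Example::builder()
        .storage("tests/test-data".to_string())
        .threshold(0.1)
        .build();
    assert_eq!(e.threshold, 0.1);
}

Because typed_builder turns a missing required field into a compile error, the `.unwrap()` calls disappear from the bench setup.
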
6 changes: 6 additions & 0 deletions include/sourmash.h
@@ -51,6 +51,12 @@ void kmerminhash_add_sequence(KmerMinHash *ptr, const char *sequence, bool force

void kmerminhash_add_word(KmerMinHash *ptr, const char *word);

void kmerminhash_remove_hash(KmerMinHash *ptr, uint64_t h);

void kmerminhash_remove_many(KmerMinHash *ptr,
const uint64_t *hashes_ptr,
uintptr_t insize);

double kmerminhash_compare(KmerMinHash *ptr, const KmerMinHash *other);

uint64_t kmerminhash_count_common(KmerMinHash *ptr, const KmerMinHash *other);
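The two new declarations expose hash removal over the C ABI. A sketch of the Rust-side shape such a binding typically has, using a stand-in struct instead of the real `KmerMinHash` (the actual implementation lives in the ffi module and may differ):

// Sketch only: MinHashStandIn stands in for the real KmerMinHash.
use std::slice;

pub struct MinHashStandIn {
    mins: Vec<u64>,
}

impl MinHashStandIn {
    pub fn remove_hash(&mut self, h: u64) {
        // Drop every occurrence of the hash from the sketch.
        self.mins.retain(|&x| x != h);
    }
}

#[no_mangle]
pub unsafe extern "C" fn kmerminhash_remove_many(
    ptr: *mut MinHashStandIn,
    hashes_ptr: *const u64,
    insize: usize, // uintptr_t in the C declaration maps to usize here
) {
    // Rebuild a &mut reference and a slice from the raw C arguments,
    // then remove each hash, mirroring the declared signature above.
    let mh = &mut *ptr;
    let hashes = slice::from_raw_parts(hashes_ptr, insize);
    for h in hashes {
        mh.remove_hash(*h);
    }
}
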
14 changes: 8 additions & 6 deletions ocf/src/lib.rs
@@ -87,7 +87,8 @@ pub fn get_readable(input_name: &str) -> Box<dyn io::Read> {
match input_name {
"-" => Box::new(BufReader::new(io::stdin())),
_ => Box::new(BufReader::new(
File::open(input_name).expect(&format!("Can't open input file {}", input_name)),
File::open(input_name)
.unwrap_or_else(|_| panic!("Can't open input file {}", input_name)),
)),
}
}
@@ -100,16 +101,16 @@ fn get_compression(mut in_stream: Box<dyn io::Read>) -> CompressionFormat {
.expect("Error durring reading first bit of file");

let mut five_bit_val: u64 = 0;
for i in 0..5 {
five_bit_val |= (buf[i] as u64) << 8 * (4 - i);
for (i, item) in buf.iter().enumerate().take(5) {
five_bit_val |= (u64::from(*item)) << (8 * (4 - i));
}
if CompressionFormat::from_u64(five_bit_val) == Some(CompressionFormat::Lzma) {
return CompressionFormat::Lzma;
}

let mut two_bit_val: u64 = 0;
for i in 0..2 {
two_bit_val |= (buf[i] as u64) << 8 * (1 - i);
for (i, item) in buf.iter().enumerate().take(2) {
two_bit_val |= (u64::from(*item)) << (8 * (1 - i));
}

match CompressionFormat::from_u64(two_bit_val) {
@@ -213,7 +214,8 @@ fn get_writable(output_name: &str) -> Box<dyn io::Write> {
match output_name {
"-" => Box::new(BufWriter::new(io::stdout())),
_ => Box::new(BufWriter::new(
File::create(output_name).expect(&format!("Can't open output file {}", output_name)),
File::create(output_name)
.unwrap_or_else(|_| panic!("Can't open output file {}", output_name)),
)),
}
}
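The refactored loops in `get_compression` fold the first bytes of the stream into a big-endian integer that is then compared against known magic numbers. A standalone sketch of that packing with two worked values (the gzip and xz magic bytes are standard; that `CompressionFormat::from_u64` matches exactly these constants is an assumption):

// Sketch of the byte-packing used above, outside the ocf crate.
fn pack_magic(buf: &[u8], n: usize) -> u64 {
    // Same folding as get_compression: byte i lands at bit position 8 * (n - 1 - i).
    let mut val = 0u64;
    for (i, item) in buf.iter().enumerate().take(n) {
        val |= u64::from(*item) << (8 * (n - 1 - i));
    }
    val
}

fn main() {
    // gzip streams start with 0x1f 0x8b; xz/lzma streams start with 0xfd '7' 'z' 'X' 'Z'.
    assert_eq!(pack_magic(&[0x1f, 0x8b], 2), 0x1f8b);
    assert_eq!(pack_magic(&[0xfd, 0x37, 0x7a, 0x58, 0x5a], 5), 0x00fd_377a_585a);
}
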
60 changes: 60 additions & 0 deletions src/bin/draff.rs
@@ -0,0 +1,60 @@
use clap::{load_yaml, App};
use exitfailure::ExitFailure;
//use human_panic::setup_panic;
use sourmash::cmd::{draff_compare, draff_index, draff_search, draff_signature};

fn main() -> Result<(), ExitFailure> {
//setup_panic!();

env_logger::Builder::from_env(env_logger::Env::default().default_filter_or("info")).init();

let yml = load_yaml!("draff.yml");
let m = App::from_yaml(yml).get_matches();

match m.subcommand_name() {
Some("compute") => {
let cmd = m.subcommand_matches("compute").unwrap();
let inputs = cmd
.values_of("inputs")
.map(|vals| vals.collect::<Vec<_>>())
.unwrap();

let ksize: usize = cmd.value_of("ksize").unwrap().parse().unwrap();
let wsize: usize = cmd.value_of("wsize").unwrap().parse().unwrap();

draff_signature(inputs, ksize, wsize)?;
}
Some("search") => {
let cmd = m.subcommand_matches("search").unwrap();

let index: &str = cmd.value_of("index").unwrap();
let query: &str = cmd.value_of("query").unwrap();

draff_search(index, query)?;
}
Some("compare") => {
let cmd = m.subcommand_matches("compare").unwrap();
let inputs = cmd
.values_of("inputs")
.map(|vals| vals.collect::<Vec<_>>())
.unwrap();

draff_compare(inputs)?;
}
Some("index") => {
let cmd = m.subcommand_matches("index").unwrap();
let inputs = cmd
.values_of("inputs")
.map(|vals| vals.collect::<Vec<_>>())
.unwrap();

let output: &str = cmd.value_of("output").unwrap();

draff_index(inputs, output)?;
}
_ => {
println!("{:?}", m);
}
}
Ok(())
}
66 changes: 66 additions & 0 deletions src/bin/draff.yml
@@ -0,0 +1,66 @@
name: draff
version: "0.0.1"
about: "draff signature commands"
author: Luiz Irber <sourmash@luizirber.org>

settings:
- SubcommandRequiredElseHelp

subcommands:
- compute:
about: create a draff signature
settings:
- ArgRequiredElseHelp
args:
- ksize:
help: ksize
short: K
default_value: "9"
takes_value: true
required: false
- wsize:
help: window size
short: W
default_value: "31"
takes_value: true
required: false
- output:
help: alternative output file
short: o
takes_value: true
required: false
- inputs:
help: FASTA files
multiple: true
- index:
about: create a draff index
settings:
- ArgRequiredElseHelp
args:
- output:
help: alternative output file
short: o
takes_value: true
required: false
- inputs:
help: draff signatures
multiple: true
- compare:
about: compare draff signatures
settings:
- ArgRequiredElseHelp
args:
- inputs:
help: draff signatures
multiple: true
- search:
about: search a draff index
settings:
- ArgRequiredElseHelp
args:
- index:
help: index to search
required: true
- query:
help: draff signature to search
required: true
