Skip to content

Commit

Permalink
MRG: Calculate all gather stats in rust; use for rocksdb gather (#2943)
Browse files Browse the repository at this point in the history
This PR adds a `calculate_gather_stats` function (and criterion
benchmark) to calculate all gather statistics directly in rust. It makes
use of this function in `disk_revindex.rs`, meaning that gather on a
rocksdb database will now produce the full `GatherResult`.

Note that this PR does not change the python layer to use these stats.
This is just a step on the way to that :).

## New functions:

- `Sketch::MinHash::sum_abunds()` - sum abundances in a MinHash
- `Sketch::MinHash::n_unique_kmers()` - estimate the number of unique k-mers as `len(self) * self.scaled`.
- `Sketch::inflate(abunds_from)` - inflate abundances in self using
abund values from `abunds_from` MinHash
- `Sketch::inflated_abundances(abunds_from)` - same process as
`inflate`, but just return the abundance vector and sum of abundances.
- `ani_utils::ani_from_containment` - a streamlined function to get ANI
from containment. I suspect a lot of the time added for ANI calculation
(ref: testing over in branchwater) is the way we calculate it in python
(including calculating a total number of k-mers). This excludes anything
not really needed for basic ANI without confidence intervals.
- `ani_utils::ani_from_containment_ci` - get high and low CI ANI from
containment
- `index::calculate_gather_stats` - calculate all gather statistics that
can be separated from the gather iteration and return the full
`GatherResult`

The "expensive" calculations:
- abundance-weighted values require re-calculating intersections and/or
manipulating the abundance vector
> Thankfully I think the new `inflate`/`inflated_abundances` code is
pretty efficient? It uses the `itertools` `merge_join_by` strategy from
https://github.com/sourmash-bio/sourmash/compare/lirber/itertools_merge.
Thanks @luizirber :)
- abundance `median` and `stddev` currently require cloning the abundance
vector; it is not yet clear how much of a performance issue that is.
- ANI with confidence intervals

## Punted Issues
- #3020: calculate all gather stats for all other database types. Start
with `linear.rs`, since it already uses the `GatherResult` struct.
- #3021: try moving `calculate_gather_stats` outside of the `gather`
functions to enable postprocessing in parallel.
- #3022: write ffi for the new functions?
  - `sketch::minhash::sum_abunds`
  - `sketch::minhash::n_unique_kmers`
  - `sketch::minhash::inflate`
  - `sketch::minhash::inflated_abundances`
  - `ani_utils::ani_from_containment`
  - `ani_utils::ani_from_containment_ci`
  - `ani_utils::prob_nothing_in_common`

ref
sourmash-bio/sourmash_plugin_branchwater#187.

---------

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: Mohamed Abuelanin <mabuelanin@gmail.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: Luiz Irber <luizirber@users.noreply.github.com>
Co-authored-by: Luiz Irber <contact+github@luizirber.org>
  • Loading branch information
5 people committed Feb 21, 2024
1 parent fa4ae0b commit 297ff0b
Show file tree
Hide file tree
Showing 18 changed files with 1,173 additions and 87 deletions.
160 changes: 160 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions include/sourmash.h
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ enum SourmashErrorCode {
SOURMASH_ERROR_CODE_READ_DATA = 1201,
SOURMASH_ERROR_CODE_STORAGE = 1202,
SOURMASH_ERROR_CODE_HLL_PRECISION_BOUNDS = 1301,
SOURMASH_ERROR_CODE_ANI_ESTIMATION_ERROR = 1401,
SOURMASH_ERROR_CODE_IO = 100001,
SOURMASH_ERROR_CODE_UTF8_ERROR = 100002,
SOURMASH_ERROR_CODE_PARSE_INT = 100003,
Expand Down
12 changes: 10 additions & 2 deletions src/core/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ fixedbitset = "0.4.0"
getrandom = { version = "0.2", features = ["js"] }
getset = "0.1.1"
histogram = "0.9.1"
itertools = "0.12.0"
log = "0.4.20"
md5 = "0.7.0"
memmap2 = "0.9.4"
Expand All @@ -49,11 +50,14 @@ once_cell = "1.18.0"
ouroboros = "0.18.3"
piz = "0.5.0"
primal-check = "0.3.1"
rayon = { version = "1.8.1", optional = true }
rkyv = { version = "0.7.44", optional = true }
roaring = "0.10.3"
rayon = { version = "1.8.1", optional = true }
roots = "0.0.8"
serde = { version = "1.0.196", features = ["derive"] }
serde_json = "1.0.113"
statrs = "0.16.0"
streaming-stats = "0.2.3"
thiserror = "1.0"
twox-hash = "1.6.0"
typed-builder = "0.18.0"
Expand All @@ -78,6 +82,10 @@ harness = false
name = "minhash"
harness = false

[[bench]]
name = "gather"
harness = false

[package.metadata.cargo-all-features]
skip_optional_dependencies = true
denylist = ["maturin"]
Expand All @@ -96,7 +104,7 @@ version = "0.3.68"
features = ["console", "File"]

[target.'cfg(all(target_arch = "wasm32"))'.dependencies.chrono]
version = "0.4.34"
version = "0.4.32"
features = ["wasmbind"]

[target.'cfg(all(target_arch = "wasm32", target_os="unknown"))'.dev-dependencies]
Expand Down
60 changes: 60 additions & 0 deletions src/core/benches/gather.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
use std::fs::File;
use std::io::BufReader;
use std::path::PathBuf;

use sourmash::collection::Collection;
use sourmash::signature::Signature;
use sourmash::sketch::Sketch;
use sourmash::{index::calculate_gather_stats, storage::SigStore};

use criterion::{black_box, criterion_group, criterion_main, Criterion};

fn gather_stats_benchmarks(c: &mut Criterion) {
let mut filename = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
filename.push("../../tests/test-data/track_abund/47.fa.sig");
let file = File::open(filename).unwrap();
let reader = BufReader::new(file);
let sig = Signature::from_reader(reader)
.expect("Loading error")
.swap_remove(0);
let orig_query = sig.minhash().unwrap();
let query = orig_query.clone();
let total_weighted_hashes = orig_query.sum_abunds();

let mut filename = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
filename.push("../../tests/test-data/track_abund/63.fa.sig");
// load collection to get sig in sigstore
let signatures = Signature::from_path(filename).expect("cant find file");
let collection = Collection::from_sigs(signatures).expect("cant make collection");
let match_sig: SigStore = collection.sig_for_dataset(0).expect("cant load sig");
let test_cases = vec![(false, false), (true, false), (false, true), (true, true)];

let mut group = c.benchmark_group("gather_stats");
for (calc_abund_stats, calc_ani_ci) in test_cases {
let test_name = format!(
"abund{}_ani_ci{}",
calc_abund_stats as u8, calc_ani_ci as u8
);
group.bench_function(&test_name, |b| {
b.iter(|| {
calculate_gather_stats(
black_box(&orig_query),
black_box(query.clone()),
black_box(match_sig.clone()),
black_box(42), // Example match_size
black_box(1), // Example gather_result_rank
black_box(200),
black_box(total_weighted_hashes.try_into().unwrap()),
black_box(calc_abund_stats),
black_box(calc_ani_ci),
black_box(None), // don't set custom confidence intervals
)
.expect("error calculating gather stats");
});
});
}
group.finish();
}

// Register `gather_stats_benchmarks` in a criterion group named `gather`, then
// generate the `main` entry point that runs that group (this bench target is
// declared with `harness = false`, so criterion supplies `main`).
criterion_group!(gather, gather_stats_benchmarks);
criterion_main!(gather);

0 comments on commit 297ff0b

Please sign in to comment.