Skip to content

Commit

Permalink
MRG: safer ksize selection while still accommodating k=k*3 (#3028)
Browse files Browse the repository at this point in the history
Check signature `.hash_function` to determine whether or not we need to
make protein ksize corrections.

* Fixes #3026
  • Loading branch information
bluegenes committed Feb 23, 2024
1 parent b14ef5d commit 6db763c
Show file tree
Hide file tree
Showing 2 changed files with 132 additions and 6 deletions.
49 changes: 44 additions & 5 deletions src/core/src/manifest.rs
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ impl Record {
pub fn from_sig(sig: &Signature, path: &str) -> Vec<Self> {
sig.iter()
.map(|sketch| {
let (mut ksize, md5, with_abundance, moltype, n_hashes, num, scaled) = match sketch
let (mut ksize, md5, with_abundance, moltype, n_hashes, num, scaled, hash_function) = match sketch
{
Sketch::MinHash(mh) => (
mh.ksize() as u32,
Expand All @@ -92,6 +92,7 @@ impl Record {
mh.size(),
mh.num(),
mh.scaled(),
mh.hash_function(),
),
Sketch::LargeMinHash(mh) => (
mh.ksize() as u32,
Expand All @@ -101,15 +102,17 @@ impl Record {
mh.size(),
mh.num(),
mh.scaled(),
mh.hash_function(),
),
_ => unimplemented!(),
};

let md5short = md5[0..8].into();

if moltype != HashFunctions::Murmur64Dna {
ksize /= 3;
}
ksize = match hash_function {
HashFunctions::Murmur64Protein | HashFunctions::Murmur64Dayhoff | HashFunctions::Murmur64Hp => ksize / 3,
_ => ksize,
};

Self {
internal_location: path.into(),
Expand Down Expand Up @@ -329,6 +332,9 @@ mod test {
use tempfile::TempDir;

use super::Manifest;
use crate::collection::Collection;
use crate::encodings::HashFunctions;
use crate::selection::{Select, Selection};

#[test]
fn manifest_from_pathlist() {
Expand Down Expand Up @@ -405,7 +411,7 @@ mod test {
}

#[test]
fn test_manifest_to_writer_bools() {
fn manifest_to_writer_bools() {
let base_path = PathBuf::from(env!("CARGO_MANIFEST_DIR"));

let test_sigs = vec![
Expand Down Expand Up @@ -441,4 +447,37 @@ mod test {
}
}
}

#[test]
fn manifest_selection() {
    // Locate the protein test archive relative to the crate root.
    let zip_path =
        PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("../../tests/test-data/prot/all.zip");

    let collection = Collection::from_zipfile(&zip_path).unwrap();

    // `select` consumes the manifest, so every query runs on a fresh clone
    // of the collection's manifest and reports how many records survive.
    let count_selected =
        |sel: &Selection| collection.manifest().clone().select(sel).unwrap().len();

    // Selecting on ksize alone.
    let mut selection = Selection::default();
    selection.set_ksize(19);
    assert_eq!(count_selected(&selection), 6);

    // Adding a protein moltype on top of the ksize narrows the result.
    selection.set_moltype(HashFunctions::Murmur64Protein);
    assert_eq!(count_selected(&selection), 2);

    // Start over from an empty selection and filter on scaled only.
    selection = Selection::default();
    selection.set_scaled(100);
    assert_eq!(count_selected(&selection), 6);
}
}
89 changes: 88 additions & 1 deletion src/core/src/signature.rs
Original file line number Diff line number Diff line change
Expand Up @@ -801,7 +801,13 @@ impl Select for Signature {
let mut valid = true;
valid = if let Some(ksize) = selection.ksize() {
let k = s.ksize() as u32;
k == ksize || k == ksize * 3
let adjusted_ksize = match s.hash_function() {
HashFunctions::Murmur64Protein
| HashFunctions::Murmur64Dayhoff
| HashFunctions::Murmur64Hp => ksize * 3,
_ => ksize,
};
k == adjusted_ksize
} else {
valid
};
Expand Down Expand Up @@ -1152,6 +1158,87 @@ mod test {
}
}

#[test]
fn selection_protein() {
    // Load the protein signature fixture from the shared test-data directory.
    let sig_path = PathBuf::from(env!("CARGO_MANIFEST_DIR")).join(
        "../../tests/test-data/prot/protein/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig",
    );
    let reader = BufReader::new(File::open(sig_path).unwrap());
    let sigs: Vec<Signature> = serde_json::from_reader(reader).expect("Loading error");

    // Select by ksize only; no moltype is set on the selection.
    let prot_ksize = 19;
    let mut selection = Selection::default();
    selection.set_ksize(prot_ksize);

    let selected_sig = sigs[0].clone().select(&selection).unwrap();

    // The surviving sketch reports a ksize three times the requested value.
    let mh = selected_sig.minhash().unwrap();
    assert_eq!(mh.ksize(), prot_ksize as usize * 3);
}

#[test]
fn selection_dayhoff() {
    // Load the dayhoff signature fixture from the shared test-data directory.
    let sig_path = PathBuf::from(env!("CARGO_MANIFEST_DIR")).join(
        "../../tests/test-data/prot/dayhoff/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig",
    );
    let reader = BufReader::new(File::open(sig_path).unwrap());
    let sigs: Vec<Signature> = serde_json::from_reader(reader).expect("Loading error");

    // Select by ksize together with the dayhoff moltype.
    let prot_ksize = 19;
    let mut selection = Selection::default();
    selection.set_ksize(prot_ksize);
    selection.set_moltype(crate::encodings::HashFunctions::Murmur64Dayhoff);

    let selected_sig = sigs[0].clone().select(&selection).unwrap();

    // The surviving sketch reports a ksize three times the requested value.
    let mh = selected_sig.minhash().unwrap();
    assert_eq!(mh.ksize(), prot_ksize as usize * 3);
}

#[test]
fn selection_hp() {
    // Load the hp signature fixture from the shared test-data directory.
    let sig_path = PathBuf::from(env!("CARGO_MANIFEST_DIR"))
        .join("../../tests/test-data/prot/hp/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig");
    let reader = BufReader::new(File::open(sig_path).unwrap());
    let sigs: Vec<Signature> = serde_json::from_reader(reader).expect("Loading error");

    // Select by ksize together with the hp moltype.
    let prot_ksize = 19;
    let mut selection = Selection::default();
    selection.set_ksize(prot_ksize);
    selection.set_moltype(crate::encodings::HashFunctions::Murmur64Hp);

    let selected_sig = sigs[0].clone().select(&selection).unwrap();

    // The surviving sketch reports a ksize three times the requested value.
    let mh = selected_sig.minhash().unwrap();
    assert_eq!(mh.ksize(), prot_ksize as usize * 3);
}

#[test]
fn selection_protein2() {
    // Load the protein signature fixture from the shared test-data directory.
    let sig_path = PathBuf::from(env!("CARGO_MANIFEST_DIR")).join(
        "../../tests/test-data/prot/protein/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig",
    );
    let reader = BufReader::new(File::open(sig_path).unwrap());
    let sigs: Vec<Signature> = serde_json::from_reader(reader).expect("Loading error");

    // Request the already-tripled ksize (19 * 3) instead of the protein ksize.
    let prot_ksize = 19;
    let mut selection = Selection::default();
    selection.set_ksize(prot_ksize * 3);

    let selected_sig = sigs[0].clone().select(&selection).unwrap();

    // Nothing should match: the selected signature has no minhash left.
    assert!(selected_sig.minhash().is_none());
}

#[test]
fn selection_scaled_too_low() {
let mut filename = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
Expand Down

0 comments on commit 6db763c

Please sign in to comment.