Skip to content

Commit

Permalink
Allow changing storage location for a collection in RevIndex (#3015)
Browse files Browse the repository at this point in the history
As part of sourmash-bio/branchwater#4 I hit a
pretty big drawback on the `RevIndex::open` API: it has no way of
dealing with the `Collection` storage moving to another location.

This also means that the index is non-relocatable if the `Collection`
uses a relative path to where the index was built originally =(

I started by making a test that build an index, move the sigs to another
location, then try to open the index without passing a new location
(should error), and then passing an updated location (should work).

This break semantic versioning (added argument to `open`), so requires a
version bump. Since it will be a version bump, I also added the
`[non_exhaustive]` annotation to `SourmashError`, even tho this PR is
not updating it, because it would always be a breaking change if we add
a new error and don't require users to check for all cases (if matching
explicitly).

---

I didn't dump the RevIndex version because there are no changes to the
file format, should be compatible with existing DBs
  • Loading branch information
luizirber committed Feb 20, 2024
1 parent 42639e4 commit 250b4a3
Show file tree
Hide file tree
Showing 5 changed files with 89 additions and 11 deletions.
2 changes: 1 addition & 1 deletion Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion src/core/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "sourmash"
version = "0.12.1"
version = "0.13.0"
authors = ["Luiz Irber <luiz.irber@gmail.com>"]
description = "MinHash sketches for genomic data"
repository = "https://github.com/sourmash-bio/sourmash"
Expand Down
1 change: 1 addition & 0 deletions src/core/src/errors.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
use thiserror::Error;

#[derive(Debug, Error)]
#[non_exhaustive]
pub enum SourmashError {
/// Raised for internal errors in the libraries. Should not happen.
#[error("internal error: {message:?}")]
Expand Down
25 changes: 20 additions & 5 deletions src/core/src/index/revindex/disk_revindex.rs
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,11 @@ impl RevIndex {
Ok(module::RevIndex::Plain(index))
}

pub fn open<P: AsRef<Path>>(path: P, read_only: bool) -> Result<module::RevIndex> {
pub fn open<P: AsRef<Path>>(
path: P,
read_only: bool,
storage_spec: Option<&str>,
) -> Result<module::RevIndex> {
let mut opts = module::RevIndex::db_options();
if !read_only {
opts.prepare_for_bulk_load();
Expand All @@ -120,12 +124,18 @@ impl RevIndex {
Arc::new(DB::open_cf_descriptors(&opts, path.as_ref(), cfs)?)
};

let collection = Arc::new(Self::load_collection_from_rocksdb(db.clone())?);
let collection = Arc::new(Self::load_collection_from_rocksdb(
db.clone(),
storage_spec,
)?);

Ok(module::RevIndex::Plain(Self { db, collection }))
}

fn load_collection_from_rocksdb(db: Arc<DB>) -> Result<CollectionSet> {
fn load_collection_from_rocksdb(
db: Arc<DB>,
storage_spec: Option<&str>,
) -> Result<CollectionSet> {
let cf_metadata = db.cf_handle(METADATA).unwrap();

let rdr = db.get_cf(&cf_metadata, VERSION)?.unwrap();
Expand All @@ -134,8 +144,13 @@ impl RevIndex {
let rdr = db.get_cf(&cf_metadata, MANIFEST)?.unwrap();
let manifest = Manifest::from_reader(&rdr[..])?;

let spec = String::from_utf8(db.get_cf(&cf_metadata, STORAGE_SPEC)?.unwrap())
.expect("invalid utf-8");
let spec = match storage_spec {
Some(spec) => spec.into(),
None => {
let db_spec = db.get_cf(&cf_metadata, STORAGE_SPEC)?;
String::from_utf8(db_spec.unwrap()).map_err(|e| e.utf8_error())?
}
};

let storage = if spec == "rocksdb://" {
todo!("init storage from db")
Expand Down
70 changes: 66 additions & 4 deletions src/core/src/index/revindex/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -185,7 +185,7 @@ impl RevIndex {
}
}

pub fn open<P: AsRef<Path>>(index: P, read_only: bool) -> Result<Self> {
pub fn open<P: AsRef<Path>>(index: P, read_only: bool, spec: Option<&str>) -> Result<Self> {
let opts = Self::db_options();
let cfs = DB::list_cf(&opts, index.as_ref()).unwrap();

Expand All @@ -194,7 +194,7 @@ impl RevIndex {
// due to pending unmerged colors
todo!() //color_revindex::ColorRevIndex::open(index, false)
} else {
disk_revindex::RevIndex::open(index, read_only)
disk_revindex::RevIndex::open(index, read_only, spec)
}
}

Expand Down Expand Up @@ -528,7 +528,8 @@ mod test {
let query = query.unwrap();

let new_collection = Collection::from_paths(&new_siglist)?.select(&selection)?;
let index = RevIndex::open(output.path(), false)?.update(new_collection.try_into()?)?;
let index =
RevIndex::open(output.path(), false, None)?.update(new_collection.try_into()?)?;

let counter = index.counter_for_query(&query);
let matches = index.matches_from_counter(counter, 0);
Expand Down Expand Up @@ -569,7 +570,7 @@ mod test {
let _index = RevIndex::create(output.path(), collection.try_into()?, false);
}

let index = RevIndex::open(output.path(), true)?;
let index = RevIndex::open(output.path(), true, None)?;

let (counter, query_colors, hash_to_color) = index.prepare_gather_counters(&query);

Expand All @@ -588,4 +589,65 @@ mod test {

Ok(())
}

#[test]
fn revindex_move() -> Result<()> {
let basedir = PathBuf::from(env!("CARGO_MANIFEST_DIR"));

let mut zip_collection = basedir.clone();
zip_collection.push("../../tests/test-data/track_abund/track_abund.zip");

let outdir = TempDir::new()?;

let zip_copy = PathBuf::from(
outdir
.path()
.join("sigs.zip")
.into_os_string()
.into_string()
.unwrap(),
);
std::fs::copy(zip_collection, zip_copy.as_path())?;

let selection = Selection::builder().ksize(31).scaled(10000).build();
let collection = Collection::from_zipfile(zip_copy.as_path())?.select(&selection)?;
let output = outdir.path().join("index");

let query = prepare_query(collection.sig_for_dataset(0)?.into(), &selection).unwrap();

{
RevIndex::create(output.as_path(), collection.try_into()?, false)?;
}

{
let index = RevIndex::open(output.as_path(), false, None)?;

let counter = index.counter_for_query(&query);
let matches = index.matches_from_counter(counter, 0);

assert!(matches[0].0.starts_with("NC_009665.1"));
assert_eq!(matches[0].1, 514);
}

let new_zip = outdir
.path()
.join("new_sigs.zip")
.into_os_string()
.into_string()
.unwrap();
std::fs::rename(zip_copy, &new_zip)?;

// RevIndex can't know where the new sigs are
assert!(RevIndex::open(output.as_path(), false, None).is_err());

let index = RevIndex::open(output.as_path(), false, Some(&format!("zip://{}", new_zip)))?;

let counter = index.counter_for_query(&query);
let matches = index.matches_from_counter(counter, 0);

assert!(matches[0].0.starts_with("NC_009665.1"));
assert_eq!(matches[0].1, 514);

Ok(())
}
}

0 comments on commit 250b4a3

Please sign in to comment.