From 3bfdc40d16f8460da455da771dcc304fb4b71e95 Mon Sep 17 00:00:00 2001 From: Luiz Irber Date: Thu, 25 Mar 2021 15:49:30 -0700 Subject: [PATCH] add documentation and unique_kmers changes from #1009 --- doc/developer.md | 55 ++++++++++++++++++++++++++++++++ src/core/src/sketch/nodegraph.rs | 4 ++- 2 files changed, 58 insertions(+), 1 deletion(-) diff --git a/doc/developer.md b/doc/developer.md index f21ee4b03..fb9abaa07 100644 --- a/doc/developer.md +++ b/doc/developer.md @@ -263,6 +263,61 @@ For the Rust core library we use `rMAJOR.MINOR.PATCH` The Rust version is not automated, and must be bumped in `src/core/Cargo.toml`. +## Nodegraph compatibility with khmer + +For more information, check the [binary formats](https://khmer.readthedocs.io/en/latest/dev/binary-file-formats.html) section in khmer. + +### Version 4 (same as khmer) + +The header is in the format below, in the order of file offset. Value +macro definitions are given in parentheses. + +| Field | Len | Off | Value | +| ----------------- | --- | --- | ------------------------------------------- | +| Magic string | 4 | 0 | ``OXLI`` (``SAVED_SIGNATURE``) | +| Version | 1 | 4 | ``0x04`` (``SAVED_FORMAT_VERSION``) | +| File Type | 1 | 5 | ``0x02`` (``SAVED_HASHBITS``) | +| K-size | 4 | 6 | k-mer length. [``unsigned int``] | +| Number of Tables | 1 | 10 | Number of Nodegraph tables. [``uint8_t``] | +| Occupied Bins | 8 | 11 | Number of occupied bins | + +Then follow the Nodegraph's tables. For each table: + +| Field | Len | Off | Value | +| ----------------- | ------ | --- | -------------------------------------------- | +| Table size | 8 | 0 | Length of table, **in bits** (``uint64_t``). | +| Bins | N/8+1 | 8 | This table's bytes, length given by previous field, divided by 8, plus 1 (``uint8_t``). | + +### Version 5 + +Version 5 is a new version incompatible with the khmer Nodegraphs because it uses +[BitMagic](http://bitmagic.io) for saving the tables. 
+It also includes the number of unique k-mers, +something that both khmer and sourmash calculate when adding new elements +but don't serialize to the binary format in version 4. + +The header is in the format below, again in the order of file offset. Value +macro definitions are given in parentheses. + +| Field | Len | Off | Value | +| ----------------- | --- | --- | ----------------------------------------- | +| Magic string | 4 | 0 | ``OXLI`` (``SAVED_SIGNATURE``) | +| Version | 1 | 4 | ``0x05`` (``SAVED_FORMAT_VERSION``) | +| File Type | 1 | 5 | ``0x02`` (``SAVED_HASHBITS``) | +| K-size | 4 | 6 | k-mer length. [``unsigned int``] | +| Unique k-mers | 8 | 10 | Number of unique k-mers. [``uint64_t``] | +| Number of Tables | 1 | 18 | Number of Nodegraph tables. [``uint8_t``] | +| Occupied Bins | 8 | 19 | Number of occupied bins | + +Then follow the Nodegraph's tables. Each table is serialized using the +BitMagic format, and must be deserialized using its deserializing methods. +For each table: + +| Field | Len | Off | Value | +| ----------------- | --- | --- | -------------------------------------------- | +| Table size | 8 | 0 | Length of table, **in bytes** (``uint64_t``). | +| Bins | N | 8 | This table's BitMagic bit-vector. Length given by previous field (``BVector``). 
| + ## Common errors and solutions ### Cannot import name `to_bytes` from `sourmash.minhash` diff --git a/src/core/src/sketch/nodegraph.rs b/src/core/src/sketch/nodegraph.rs index 3283f0160..12e362f32 100644 --- a/src/core/src/sketch/nodegraph.rs +++ b/src/core/src/sketch/nodegraph.rs @@ -190,6 +190,7 @@ impl Nodegraph { wtr.write_u8(5)?; // version wtr.write_u8(2)?; // ht_type wtr.write_u32::(self.ksize as u32)?; // ksize + wtr.write_u64::(self.unique_kmers as u64)?; // unique kmers wtr.write_u8(self.bs.len() as u8)?; // n_tables wtr.write_u64::(self.occupied_bins as u64)?; // n_occupied for count in &self.bs { @@ -327,6 +328,7 @@ impl Nodegraph { assert_eq!(ht_type, 0x02); let ksize = rdr.read_u32::()?; + let unique_kmers = rdr.read_u64::()?; let n_tables = rdr.read_u8()?; let occupied_bins = rdr.read_u64::()? as usize; @@ -344,7 +346,7 @@ impl Nodegraph { bs, ksize: ksize as usize, occupied_bins, - unique_kmers: 0, // This is a khmer issue, it doesn't save unique_kmers + unique_kmers: unique_kmers as usize, }) }