Skip to content

Commit

Permalink
Merge efef57f into 796a583
Browse files Browse the repository at this point in the history
  • Loading branch information
anp committed Feb 5, 2017
2 parents 796a583 + efef57f commit e07eef5
Show file tree
Hide file tree
Showing 2 changed files with 44 additions and 9 deletions.
2 changes: 1 addition & 1 deletion benches/fmindex.rs
Expand Up @@ -15,7 +15,7 @@ fn search_index_seeds(b: &mut Bencher) {
let sa = suffix_array(STR_1);
let bwt = bwt(STR_1, &sa);
let less = less(&bwt, &alphabet);
let occ = Occ::new(&bwt, 3, &alphabet);
let occ = Occ::new(&bwt, 128, &alphabet);
let fmindex = FMIndex::new(&bwt, &less, &occ);

b.iter(|| {
Expand Down
51 changes: 43 additions & 8 deletions src/data_structures/bwt.rs
Expand Up @@ -89,7 +89,7 @@ pub fn invert_bwt(bwt: &BWTSlice) -> Vec<u8> {
#[cfg_attr(feature = "serde_macros", derive(Serialize, Deserialize))]
pub struct Occ {
occ: Vec<Vec<usize>>,
k: usize,
k: u32,
}


Expand All @@ -105,14 +105,14 @@ impl Occ {
///
/// * `bwt` - the BWT
/// * `k` - the sampling rate: every k-th entry will be stored
pub fn new(bwt: &BWTSlice, k: usize, alphabet: &Alphabet) -> Self {
pub fn new(bwt: &BWTSlice, k: u32, alphabet: &Alphabet) -> Self {
let n = bwt.len();
let m = alphabet.max_symbol().expect("Expecting non-empty alphabet.") as usize + 1;
let mut occ = Vec::with_capacity(n / k);
let mut occ = Vec::with_capacity(n / k as usize);
let mut curr_occ: Vec<usize> = repeat(0).take(m).collect();
for (i, &c) in bwt.iter().enumerate() {
curr_occ[c as usize] += 1;
if i % k == 0 {
if i % k as usize == 0 {
occ.push(curr_occ.clone());
}
}
Expand All @@ -123,11 +123,46 @@ impl Occ {
/// Get occurrence count of symbol a in BWT[..r+1].
/// Complexity: O(k).
pub fn get(&self, bwt: &BWTSlice, r: usize, a: u8) -> usize {
let i = r / self.k;

let count = bwt[(i * self.k) + 1..r + 1].iter().filter(|&&c| c == a).count();
// NOTE:
//
// Retrieving byte match counts in this function is critical to the performance of FM Index.
//
// The below manual count code is roughly equivalent to:
// ```
// let count = bwt[(i * self.k) + 1..r + 1].iter().filter(|&&c| c == a).count();
// self.occ[i][a as usize] + count
// ```
//
// But there are a couple of reasons to do this manually:
// 1) As of 2016, versions of rustc/LLVM vectorize this manual loop more reliably
// than the iterator adapter version.
// 2) Manually accumulating the byte match count in a single chunk can allows
// us to use a `u32` for that count, which has faster arithmetic on common arches.
// This does necessitate storing `k` as a u32.
//
// See the conversation in these issues for some of the history here:
//
// https://github.com/rust-bio/rust-bio/pull/74
// https://github.com/rust-bio/rust-bio/pull/76

// self.k is our sampling rate, so find our last sampled checkpoint
let i = r / self.k as usize;
let checkpoint = self.occ[i][a as usize];

// find the portion of the BWT past the checkpoint which we need to count
let start = (i * self.k as usize) + 1;
let end = r + 1;

// count all the matching bytes b/t the closest checkpoint and our desired lookup
let mut count: u32 = 0;
for &x in &bwt[start..end] {
if x == a {
count += 1;
}
}

self.occ[i][a as usize] + count
// return the sampled checkpoint for this character + the manual count we just did
checkpoint + (count as usize)
}
}

Expand Down

0 comments on commit e07eef5

Please sign in to comment.