feat: introduces liftover capabilities

This commit is rebased and comprised of the following commits: * feat: introduces liftover (sans correct negative strand lookups) * feat: adds in correct negative strand lookups * docs: updates documentation and slightly reorganizes modules
stjude-rust-labs · Apr 24, 2023 · 4516665 · 4516665
1 parent 5cd0539
commit 4516665
Show file tree

Hide file tree

Showing 21 changed files with 3,516 additions and 43 deletions.
diff --git a/Cargo.toml b/Cargo.toml
@@ -10,3 +10,7 @@ version = "0.1.0"
 
 [dev-dependencies]
 flate2 = "1.0.25"
+noodles = { version = "0.35.0", features = ["core", "fasta"] }
+
+[dependencies]
+rust-lapper = "1.1.0"
diff --git a/examples/chain_check.rs b/examples/chain_check.rs
@@ -0,0 +1,223 @@
+//! This example is a very simple approach to testing the validity of a chain
+//! file. To run the example, you'll need a reference FASTA, a query FASTA, as
+//! well as a chain file lifting over from the reference to the query. You can
+//! call the program like so:
+//!
+//! ```
+//! cargo run --release --example chain_check <CHAIN> <REFERENCE_FASTA> <QUERY_FASTA>
+//! ```
+//!
+//! If all goes well, the program will print out the total number of positions
+//! compared, the total number of positions with mismatches, and the overall
+//! match rate between the two genomes (via the chain file). In some cases, you
+//! will experience errors: this indicates a malformed chain file or incorrect
+//! pairings of chain file/reference FASTA/query FASTA.
+//!
+//! Note that this program somewhat simplifies what a "match" between the two
+//! genomes means. For example, all nucleotides within both FASTA files are
+//! mapped to 'A', 'C', 'G', 'T', or 'Other'. Lowercase and uppercase letters in
+//! the FASTA file have semantic meaning, but for the purposes of this
+//! comparison, they are considered the same.
+
+use std::collections::HashMap;
+use std::env;
+use std::fs::File;
+use std::io::BufReader;
+use std::iter::zip;
+
+use chain::core::Coordinate;
+use chain::core::Interval;
+use chain::core::Position;
+use chain::core::Strand;
+use chain::liftover;
+use chainfile as chain;
+use flate2::read::GzDecoder;
+use noodles::fasta;
+
+fn main() -> Result<(), Box<dyn std::error::Error>> {
+    let mut total_positions = 0usize;
+    let mut mismatches = 0usize;
+
+    let chain_path = env::args().nth(1).expect("missing chain path");
+    let reference_path = env::args().nth(2).expect("missing reference path");
+    let query_path = env::args().nth(3).expect("missing query path");
+
+    let chain = File::open(chain_path)
+        .map(GzDecoder::new)
+        .map(BufReader::new)
+        .map(chain::Reader::new)?;
+
+    let machine = liftover::machine::builder::Builder::default().try_build_from(chain)?;
+
+    let mut reference = HashMap::new();
+    for result in fasta::reader::Builder::default()
+        .build_from_path(reference_path)?
+        .records()
+    {
+        let record = result?;
+        reference.insert(record.name().to_string(), record.sequence().clone());
+    }
+
+    let mut query = HashMap::new();
+    for result in fasta::reader::Builder::default()
+        .build_from_path(query_path)?
+        .records()
+    {
+        let record = result?;
+        query.insert(record.name().to_string(), record.sequence().clone());
+    }
+
+    for (name, reference_sequence) in reference {
+        let interval = Interval::try_new(
+            Coordinate::try_new(name.clone(), 0, Strand::Positive)?,
+            Coordinate::try_new(name, reference_sequence.len(), Strand::Positive)?,
+        )?;
+
+        let results = match machine.liftover(&interval) {
+            Some(results) => results,
+            None => {
+                println!(
+                    "INFO: region {} is not included in the query genome.",
+                    interval
+                );
+                continue;
+            }
+        };
+
+        for region in results {
+            let query_sequence = query.get(region.query().contig()).unwrap_or_else(|| {
+                panic!(
+                    "query fasta did not contain necessary contig: {}. \
+                    Are the FASTA files and chain file correct?",
+                    region.query().contig()
+                )
+            });
+
+            let reference = get_sequence_for_interval(&reference_sequence, region.reference());
+            let query = get_sequence_for_interval(query_sequence, region.query());
+
+            assert_eq!(reference.len(), query.len());
+            total_positions += reference.len();
+            mismatches += count_mismatches(&reference, &query);
+        }
+    }
+
+    let result = (1.0 - (mismatches as f64 / total_positions as f64)) * 100.0;
+    println!("Total mismatches found: {}", mismatches);
+    println!("Total positions checked: {}", total_positions);
+    println!("Percentage match for lifted regions: {:.1}%", result);
+
+    Ok(())
+}
+
+fn get_sequence_for_interval(
+    sequence: &fasta::record::Sequence,
+    interval: &chain::core::Interval,
+) -> Vec<Nucleotide> {
+    let result = sequence
+        .slice(parse_interval(interval))
+        .expect("sequence lookup to succeed. Are the FASTA files and chain file correct?")
+        .as_ref()
+        .iter()
+        .map(Nucleotide::from)
+        .collect::<Vec<_>>();
+
+    match interval.strand() {
+        Strand::Positive => result,
+        Strand::Negative => result
+            .into_iter()
+            .rev()
+            .map(|n| n.complement())
+            .collect::<Vec<_>>(),
+    }
+}
+
+fn parse_interval(interval: &chain::core::Interval) -> noodles::core::region::Interval {
+    let (start, end) = match interval.strand() {
+        Strand::Positive => {
+            let start = match interval.start().position() {
+                Position::ZeroBased(position) => {
+                    noodles::core::Position::try_from(*position + 1).unwrap()
+                }
+                Position::NegativeBound => unreachable!(),
+            };
+
+            let end = match interval.end().position() {
+                Position::ZeroBased(position) => {
+                    noodles::core::Position::try_from(*position).unwrap()
+                }
+                Position::NegativeBound => unreachable!(),
+            };
+
+            (start, end)
+        }
+        Strand::Negative => {
+            let start = match interval.end().position() {
+                Position::ZeroBased(position) => {
+                    noodles::core::Position::try_from(*position + 2).unwrap()
+                }
+                Position::NegativeBound => noodles::core::Position::try_from(1).unwrap(),
+            };
+
+            let end = match interval.start().position() {
+                Position::ZeroBased(position) => {
+                    noodles::core::Position::try_from(*position + 1).unwrap()
+                }
+                Position::NegativeBound => unreachable!(),
+            };
+
+            (start, end)
+        }
+    };
+
+    noodles::core::region::Interval::from(start..=end)
+}
+
+#[derive(Debug, Eq, PartialEq)]
+pub enum Nucleotide {
+    A,
+    C,
+    G,
+    T,
+    Other,
+}
+
+impl Nucleotide {
+    pub fn complement(self) -> Nucleotide {
+        match self {
+            Nucleotide::A => Nucleotide::T,
+            Nucleotide::C => Nucleotide::G,
+            Nucleotide::G => Nucleotide::C,
+            Nucleotide::T => Nucleotide::A,
+            Nucleotide::Other => Nucleotide::Other,
+        }
+    }
+}
+
+impl From<&u8> for Nucleotide {
+    fn from(value: &u8) -> Self {
+        match value {
+            b'A' => Nucleotide::A,
+            b'a' => Nucleotide::A,
+            b'C' => Nucleotide::C,
+            b'c' => Nucleotide::C,
+            b'G' => Nucleotide::G,
+            b'g' => Nucleotide::G,
+            b'T' => Nucleotide::T,
+            b't' => Nucleotide::T,
+            _ => Nucleotide::Other,
+        }
+    }
+}
+
+pub fn count_mismatches(reference: &[Nucleotide], query: &[Nucleotide]) -> usize {
+    let mut result = 0;
+
+    for (r, q) in zip(reference.iter(), query.iter()) {
+        if r != q {
+            result += 1;
+        }
+    }
+
+    result
+}
diff --git a/examples/chain_interrogate.rs b/examples/chain_interrogate.rs
@@ -0,0 +1,50 @@
+use std::env;
+use std::fs::File;
+use std::io::BufReader;
+
+use chain::core::Coordinate;
+use chain::core::Interval;
+use chainfile as chain;
+use flate2::read::GzDecoder;
+
+fn main() -> Result<(), Box<dyn std::error::Error>> {
+    let interval = env::args().nth(1).expect("missing interval");
+    let src = env::args().nth(2).expect("missing src");
+
+    let interval = interval.parse::<Interval>()?;
+
+    let mut reader = File::open(src)
+        .map(GzDecoder::new)
+        .map(BufReader::new)
+        .map(chain::Reader::new)?;
+
+    for result in reader.sections() {
+        let section = result?;
+        let section_interval = Interval::try_new(
+            Coordinate::try_new(
+                section
+                    .header()
+                    .reference_sequence()
+                    .chromosome_name()
+                    .clone(),
+                section.header().reference_sequence().alignment_start(),
+                section.header().reference_sequence().strand().clone(),
+            )?,
+            Coordinate::try_new(
+                section
+                    .header()
+                    .reference_sequence()
+                    .chromosome_name()
+                    .clone(),
+                section.header().reference_sequence().alignment_end(),
+                section.header().reference_sequence().strand().clone(),
+            )?,
+        )?;
+
+        if section_interval.contains(interval.start()) {
+            println!("{}", section.header());
+        }
+    }
+
+    Ok(())
+}
diff --git a/examples/chain_stepthrough.rs b/examples/chain_stepthrough.rs
@@ -0,0 +1,26 @@
+use std::env;
+use std::fs::File;
+use std::io::BufReader;
+
+use chainfile as chain;
+use flate2::read::GzDecoder;
+
+fn main() -> Result<(), Box<dyn std::error::Error>> {
+    let src = env::args().nth(1).expect("missing src");
+
+    let mut reader = File::open(src)
+        .map(GzDecoder::new)
+        .map(BufReader::new)
+        .map(chain::Reader::new)?;
+
+    for result in reader.sections() {
+        let section = result?;
+
+        for result in section.stepthrough()? {
+            let pair = result?;
+            println!("{:?} -> {:?}", pair.reference(), pair.query());
+        }
+    }
+
+    Ok(())
+}
diff --git a/examples/liftover.rs b/examples/liftover.rs
@@ -0,0 +1,34 @@
+use std::env;
+use std::fs::File;
+use std::io::BufReader;
+
+use chain::core::Interval;
+use chain::liftover;
+use chainfile as chain;
+use flate2::read::GzDecoder;
+
+fn main() -> Result<(), Box<dyn std::error::Error>> {
+    let interval = env::args().nth(1).expect("missing interval");
+    let src = env::args().nth(2).expect("missing src");
+
+    let interval = interval.parse::<Interval>()?;
+
+    let reader = File::open(src)
+        .map(GzDecoder::new)
+        .map(BufReader::new)
+        .map(chain::Reader::new)?;
+
+    let machine = liftover::machine::builder::Builder::default().try_build_from(reader)?;
+    let results = machine.liftover(&interval);
+
+    match results {
+        Some(results) => {
+            for result in results {
+                println!("{}", result);
+            }
+        }
+        None => println!("Does not exist in new genome"),
+    }
+
+    Ok(())
+}
diff --git a/src/core.rs b/src/core.rs
@@ -0,0 +1,11 @@
+//! Core functionality used across the crate.
+
+pub mod coordinate;
+pub mod interval;
+pub mod strand;
+
+pub use coordinate::Contig;
+pub use coordinate::Coordinate;
+pub use coordinate::Position;
+pub use interval::Interval;
+pub use strand::Strand;