Skip to content

Commit

Permalink
feat: introduces liftover capabilities
Browse files Browse the repository at this point in the history
This commit is rebased and comprised of the following commits:

* feat: introduces liftover (sans correct negative strand lookups)
* feat: adds in correct negative strand lookups
* docs: updates documentation and slightly reorganizes modules
  • Loading branch information
claymcleod committed Apr 24, 2023
1 parent 5cd0539 commit 4516665
Show file tree
Hide file tree
Showing 21 changed files with 3,516 additions and 43 deletions.
4 changes: 4 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -10,3 +10,7 @@ version = "0.1.0"

[dev-dependencies]
flate2 = "1.0.25"
noodles = { version = "0.35.0", features = ["core", "fasta"] }

[dependencies]
rust-lapper = "1.1.0"
223 changes: 223 additions & 0 deletions examples/chain_check.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,223 @@
//! This example is a very simple approach to testing the validity of a chain
//! file. To run the example, you'll need a reference FASTA, a query FASTA, as
//! well as a chain file lifting over from the reference to the query. You can
//! call the program like so:
//!
//! ```
//! cargo run --release --example chain_check <CHAIN> <REFERENCE_FASTA> <QUERY_FASTA>
//! ```
//!
//! If all goes well, the program will print out the total number of positions
//! compared, the total number of positions with mismatches, and the overall
//! match rate between the two genomes (via the chain file). In some cases, you
//! will experience errors: this indicates a malformed chain file or incorrect
//! pairings of chain file/reference FASTA/query FASTA.
//!
//! Note that this program somewhat simplifies what a "match" between the two
//! genomes means. For example, all nucleotides within both FASTA files are
//! mapped to 'A', 'C', 'G', 'T', or 'Other'. Lowercase and uppercase letters in
//! the FASTA file have semantic meaning, but for the purposes of this
//! comparison, they are considered the same.

use std::collections::HashMap;
use std::env;
use std::fs::File;
use std::io::BufReader;
use std::iter::zip;

use chain::core::Coordinate;
use chain::core::Interval;
use chain::core::Position;
use chain::core::Strand;
use chain::liftover;
use chainfile as chain;
use flate2::read::GzDecoder;
use noodles::fasta;

fn main() -> Result<(), Box<dyn std::error::Error>> {
let mut total_positions = 0usize;
let mut mismatches = 0usize;

let chain_path = env::args().nth(1).expect("missing chain path");
let reference_path = env::args().nth(2).expect("missing reference path");
let query_path = env::args().nth(3).expect("missing query path");

let chain = File::open(chain_path)
.map(GzDecoder::new)
.map(BufReader::new)
.map(chain::Reader::new)?;

let machine = liftover::machine::builder::Builder::default().try_build_from(chain)?;

let mut reference = HashMap::new();
for result in fasta::reader::Builder::default()
.build_from_path(reference_path)?
.records()
{
let record = result?;
reference.insert(record.name().to_string(), record.sequence().clone());
}

let mut query = HashMap::new();
for result in fasta::reader::Builder::default()
.build_from_path(query_path)?
.records()
{
let record = result?;
query.insert(record.name().to_string(), record.sequence().clone());
}

for (name, reference_sequence) in reference {
let interval = Interval::try_new(
Coordinate::try_new(name.clone(), 0, Strand::Positive)?,
Coordinate::try_new(name, reference_sequence.len(), Strand::Positive)?,
)?;

let results = match machine.liftover(&interval) {
Some(results) => results,
None => {
println!(
"INFO: region {} is not included in the query genome.",
interval
);
continue;
}
};

for region in results {
let query_sequence = query.get(region.query().contig()).unwrap_or_else(|| {
panic!(
"query fasta did not contain necessary contig: {}. \
Are the FASTA files and chain file correct?",
region.query().contig()
)
});

let reference = get_sequence_for_interval(&reference_sequence, region.reference());
let query = get_sequence_for_interval(query_sequence, region.query());

assert_eq!(reference.len(), query.len());
total_positions += reference.len();
mismatches += count_mismatches(&reference, &query);
}
}

let result = (1.0 - (mismatches as f64 / total_positions as f64)) * 100.0;
println!("Total mismatches found: {}", mismatches);
println!("Total positions checked: {}", total_positions);
println!("Percentage match for lifted regions: {:.1}%", result);

Ok(())
}

fn get_sequence_for_interval(
sequence: &fasta::record::Sequence,
interval: &chain::core::Interval,
) -> Vec<Nucleotide> {
let result = sequence
.slice(parse_interval(interval))
.expect("sequence lookup to succeed. Are the FASTA files and chain file correct?")
.as_ref()
.iter()
.map(Nucleotide::from)
.collect::<Vec<_>>();

match interval.strand() {
Strand::Positive => result,
Strand::Negative => result
.into_iter()
.rev()
.map(|n| n.complement())
.collect::<Vec<_>>(),
}
}

fn parse_interval(interval: &chain::core::Interval) -> noodles::core::region::Interval {
let (start, end) = match interval.strand() {
Strand::Positive => {
let start = match interval.start().position() {
Position::ZeroBased(position) => {
noodles::core::Position::try_from(*position + 1).unwrap()
}
Position::NegativeBound => unreachable!(),
};

let end = match interval.end().position() {
Position::ZeroBased(position) => {
noodles::core::Position::try_from(*position).unwrap()
}
Position::NegativeBound => unreachable!(),
};

(start, end)
}
Strand::Negative => {
let start = match interval.end().position() {
Position::ZeroBased(position) => {
noodles::core::Position::try_from(*position + 2).unwrap()
}
Position::NegativeBound => noodles::core::Position::try_from(1).unwrap(),
};

let end = match interval.start().position() {
Position::ZeroBased(position) => {
noodles::core::Position::try_from(*position + 1).unwrap()
}
Position::NegativeBound => unreachable!(),
};

(start, end)
}
};

noodles::core::region::Interval::from(start..=end)
}

#[derive(Debug, Eq, PartialEq)]
pub enum Nucleotide {
A,
C,
G,
T,
Other,
}

impl Nucleotide {
pub fn complement(self) -> Nucleotide {
match self {
Nucleotide::A => Nucleotide::T,
Nucleotide::C => Nucleotide::G,
Nucleotide::G => Nucleotide::C,
Nucleotide::T => Nucleotide::A,
Nucleotide::Other => Nucleotide::Other,
}
}
}

impl From<&u8> for Nucleotide {
fn from(value: &u8) -> Self {
match value {
b'A' => Nucleotide::A,
b'a' => Nucleotide::A,
b'C' => Nucleotide::C,
b'c' => Nucleotide::C,
b'G' => Nucleotide::G,
b'g' => Nucleotide::G,
b'T' => Nucleotide::T,
b't' => Nucleotide::T,
_ => Nucleotide::Other,
}
}
}

pub fn count_mismatches(reference: &[Nucleotide], query: &[Nucleotide]) -> usize {
let mut result = 0;

for (r, q) in zip(reference.iter(), query.iter()) {
if r != q {
result += 1;
}
}

result
}
50 changes: 50 additions & 0 deletions examples/chain_interrogate.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
use std::env;
use std::fs::File;
use std::io::BufReader;

use chain::core::Coordinate;
use chain::core::Interval;
use chainfile as chain;
use flate2::read::GzDecoder;

fn main() -> Result<(), Box<dyn std::error::Error>> {
let interval = env::args().nth(1).expect("missing interval");
let src = env::args().nth(2).expect("missing src");

let interval = interval.parse::<Interval>()?;

let mut reader = File::open(src)
.map(GzDecoder::new)
.map(BufReader::new)
.map(chain::Reader::new)?;

for result in reader.sections() {
let section = result?;
let section_interval = Interval::try_new(
Coordinate::try_new(
section
.header()
.reference_sequence()
.chromosome_name()
.clone(),
section.header().reference_sequence().alignment_start(),
section.header().reference_sequence().strand().clone(),
)?,
Coordinate::try_new(
section
.header()
.reference_sequence()
.chromosome_name()
.clone(),
section.header().reference_sequence().alignment_end(),
section.header().reference_sequence().strand().clone(),
)?,
)?;

if section_interval.contains(interval.start()) {
println!("{}", section.header());
}
}

Ok(())
}
26 changes: 26 additions & 0 deletions examples/chain_stepthrough.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
use std::env;
use std::fs::File;
use std::io::BufReader;

use chainfile as chain;
use flate2::read::GzDecoder;

fn main() -> Result<(), Box<dyn std::error::Error>> {
let src = env::args().nth(1).expect("missing src");

let mut reader = File::open(src)
.map(GzDecoder::new)
.map(BufReader::new)
.map(chain::Reader::new)?;

for result in reader.sections() {
let section = result?;

for result in section.stepthrough()? {
let pair = result?;
println!("{:?} -> {:?}", pair.reference(), pair.query());
}
}

Ok(())
}
34 changes: 34 additions & 0 deletions examples/liftover.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
use std::env;
use std::fs::File;
use std::io::BufReader;

use chain::core::Interval;
use chain::liftover;
use chainfile as chain;
use flate2::read::GzDecoder;

fn main() -> Result<(), Box<dyn std::error::Error>> {
let interval = env::args().nth(1).expect("missing interval");
let src = env::args().nth(2).expect("missing src");

let interval = interval.parse::<Interval>()?;

let reader = File::open(src)
.map(GzDecoder::new)
.map(BufReader::new)
.map(chain::Reader::new)?;

let machine = liftover::machine::builder::Builder::default().try_build_from(reader)?;
let results = machine.liftover(&interval);

match results {
Some(results) => {
for result in results {
println!("{}", result);
}
}
None => println!("Does not exist in new genome"),
}

Ok(())
}
11 changes: 11 additions & 0 deletions src/core.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
//! Core functionality used across the crate.

pub mod coordinate;
pub mod interval;
pub mod strand;

pub use coordinate::Contig;
pub use coordinate::Coordinate;
pub use coordinate::Position;
pub use interval::Interval;
pub use strand::Strand;

0 comments on commit 4516665

Please sign in to comment.